• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" codecs -- Python Codec Registry, API and helpers.
2
3
4Written by Marc-Andre Lemburg (mal@lemburg.com).
5
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8"""#"
9
10import __builtin__, sys
11
12### Registry and builtin stateless codec functions
13
14try:
15    from _codecs import *
16except ImportError, why:
17    raise SystemError('Failed to load the builtin codecs: %s' % why)
18
# Public names exported by "from codecs import *", grouped by purpose.
__all__ = [
    # registry access and file helpers
    "register", "lookup", "open", "EncodedFile",
    # byte order marks
    "BOM", "BOM_BE", "BOM_LE",
    "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
    "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
    "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
    # codec base classes
    "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
    "StreamReader", "StreamWriter",
    "StreamReaderWriter", "StreamRecoder",
    # codec component lookup helpers
    "getencoder", "getdecoder", "getincrementalencoder",
    "getincrementaldecoder", "getreader", "getwriter",
    # one-shot and iterative conversion helpers
    "encode", "decode", "iterencode", "iterdecode",
    # error handlers and their registry
    "strict_errors", "ignore_errors", "replace_errors",
    "xmlcharrefreplace_errors", "backslashreplace_errors",
    "register_error", "lookup_error",
]
32
33### Constants
34
35#
36# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
37# and its possible byte string values
38# for UTF8/UTF16/UTF32 output and little/big endian machines
39#
40
41# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian and big endian
BOM_UTF16_LE = '\xff\xfe'
BOM_UTF16_BE = '\xfe\xff'

# Short aliases for the two UTF-16 marks
BOM_LE = BOM_UTF16_LE
BOM_BE = BOM_UTF16_BE

# UTF-32, little endian and big endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Native endianness: pick the marks matching the byte order of this machine
if sys.byteorder != 'little':
    BOM_UTF16 = BOM_UTF16_BE
    BOM_UTF32 = BOM_UTF32_BE
else:
    BOM_UTF16 = BOM_UTF16_LE
    BOM_UTF32 = BOM_UTF32_LE

# BOM is the historic name of the native UTF-16 mark
BOM = BOM_UTF16

# Old broken names (don't use in new code)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77
78
79### Codec base classes (defining the API)
80
class CodecInfo(tuple):
    """Record describing one codec, as handed out by the codec registry.

    The object is the 4-tuple (encode, decode, streamreader, streamwriter)
    and additionally exposes every constructor argument as an attribute.
    """

    # Private flag that lets Python blacklist the known non-Unicode
    # codecs shipped in the standard library. A more general way to
    # reliably tell text encodings apart from other codecs will
    # hopefully be defined for Python 3.5
    #
    # See http://bugs.python.org/issue19619
    _is_text_encoding = True # Codecs count as text encodings unless told otherwise

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None,
        _is_text_encoding=None):
        # Only the four classic entries take part in the tuple value;
        # everything else is reachable as an attribute only.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        # Leave the class-level default in place unless a value was given
        if _is_text_encoding is not None:
            info._is_text_encoding = _is_text_encoding
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
109
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument which
        selects the error handling scheme. The following string
        values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                    Python will use the official U+FFFD REPLACEMENT
                    CHARACTER for the builtin Unicode codecs on
                    decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace'  - Replace with backslashed escape sequences
                               (only for encoding).

        Further values can be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encode the object input and return the tuple (output
            object, length consumed).

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            encode efficiently should use StreamWriter instead.

            Zero length input must be supported; the encoder then
            returns an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()

    def decode(self, input, errors='strict'):

        """ Decode the object input and return the tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors selects the error handling to apply; the default
            is 'strict'.

            Implementations may not store state in the Codec
            instance. Codecs which have to keep state in order to
            decode efficiently should use StreamReader instead.

            Zero length input must be supported; the decoder then
            returns an empty object of the output object type.

            This base implementation always raises NotImplementedError.

        """
        raise NotImplementedError()
173
class IncrementalEncoder(object):
    """
    Base class for encoders that work on their input in several steps.

    The input is handed to encode() piece by piece; the encoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalEncoder.

        errors names the error handling scheme to use. See the module
        docstring for a list of possible values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.

        Must be overridden; this base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the encoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Report the current state of the encoder.

        The base implementation always returns 0.
        """
        return 0

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The base implementation ignores state.
        """
213
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    IncrementalEncoder base class for encoders that cannot always encode
    everything they are given and therefore have to carry part of the
    input over to the next encode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # input that has not been encoded yet; kept between encode() calls
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Hook for subclasses: encode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def encode(self, input, final=False):
        # prepend whatever the previous call could not encode
        pending = self.buffer + input
        output, used = self._buffer_encode(pending, self.errors, final)
        # remember the unconsumed tail for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # an empty buffer is reported as 0
        if self.buffer:
            return self.buffer
        return 0

    def setstate(self, state):
        # any false state (such as the 0 from getstate()) means "empty"
        if state:
            self.buffer = state
        else:
            self.buffer = ""
246
class IncrementalDecoder(object):
    """
    Base class for decoders that work on their input in several steps.

    The input is handed to decode() piece by piece; the decoder keeps
    whatever state it needs between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Set up a new IncrementalDecoder.

        errors names the error handling scheme to use. See the module
        docstring for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.

        Must be overridden; this base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError()

    def reset(self):
        """
        Put the decoder back into its initial state.

        The base implementation does nothing.
        """

    def getstate(self):
        """
        Report the current state of the decoder.

        The result must be a (buffered_input, additional_state_info)
        tuple. buffered_input must be a bytes object holding the bytes
        that were passed to decode() but have not been converted yet.
        additional_state_info must be a non-negative integer describing
        the state of the decoder WITHOUT yet having processed the
        contents of buffered_input.  In the initial state and after
        reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Restore a state previously obtained from getstate().

        The effect of setstate((b"", 0)) must be equivalent to reset().
        The base implementation ignores state.
        """
295
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    IncrementalDecoder base class for decoders that have to cope with
    incomplete byte sequences and therefore carry undecoded input over
    to the next decode() call.
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # input that has not been decoded yet; kept between decode() calls
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Hook for subclasses: decode input and return an
        # (output, length consumed) tuple
        raise NotImplementedError()

    def decode(self, input, final=False):
        # prepend whatever the previous call could not decode
        pending = self.buffer + input
        output, used = self._buffer_decode(pending, self.errors, final)
        # remember the unconsumed tail for the next call
        self.buffer = pending[used:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # there is no additional state info, so that slot is always 0
        pending = self.buffer
        return (pending, 0)

    def setstate(self, state):
        # only the buffered input matters; additional state info is ignored
        pending = state[0]
        self.buffer = pending
330
#
# The StreamWriter and StreamReader classes provide generic working
# interfaces which can be used to implement new encoding submodules
# very easily. See encodings/utf_8.py for an example of how this is
# done.
#
337
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Set up a StreamWriter on top of stream.

            stream must be a file-like object open for writing
            (binary) data.

            errors selects the error handling scheme. These values
            are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace'  - Replace with backslashed escape
                                   sequences (only for encoding).

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encode object with self.encode() and write the result to
            self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Join the strings in list and hand the result to .write()
            in one go.
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            After this call the data on the output should be in a
            clean state, so that fresh data can be appended without
            having to rescan the whole stream to recover state.

            The base implementation does nothing.

        """
        pass

    def seek(self, offset, whence=0):
        """ Reposition the underlying stream; the codec state is reset
            only when seeking to the very beginning of the stream.
        """
        self.stream.seek(offset, whence)
        if not (whence == 0 and offset == 0):
            return
        self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Look up every other attribute on the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        # leaving the with block closes the wrapped stream
        self.stream.close()
408
409###
410
411class StreamReader(Codec):
412
413    def __init__(self, stream, errors='strict'):
414
415        """ Creates a StreamReader instance.
416
417            stream must be a file-like object open for reading
418            (binary) data.
419
420            The StreamReader may use different error handling
421            schemes by providing the errors keyword argument. These
422            parameters are predefined:
423
424             'strict' - raise a ValueError (or a subclass)
425             'ignore' - ignore the character and continue with the next
426             'replace'- replace with a suitable replacement character;
427
428            The set of allowed parameter values can be extended via
429            register_error.
430        """
431        self.stream = stream
432        self.errors = errors
433        self.bytebuffer = ""
434        # For str->str decoding this will stay a str
435        # For str->unicode decoding the first read will promote it to unicode
436        self.charbuffer = ""
437        self.linebuffer = None
438
439    def decode(self, input, errors='strict'):
440        raise NotImplementedError
441
442    def read(self, size=-1, chars=-1, firstline=False):
443
444        """ Decodes data from the stream self.stream and returns the
445            resulting object.
446
447            chars indicates the number of characters to read from the
448            stream. read() will never return more than chars
449            characters, but it might return less, if there are not enough
450            characters available.
451
452            size indicates the approximate maximum number of bytes to
453            read from the stream for decoding purposes. The decoder
454            can modify this setting as appropriate. The default value
455            -1 indicates to read and decode as much as possible.  size
456            is intended to prevent having to decode huge files in one
457            step.
458
459            If firstline is true, and a UnicodeDecodeError happens
460            after the first line terminator in the input only the first line
461            will be returned, the rest of the input will be kept until the
462            next call to read().
463
464            The method should use a greedy read strategy meaning that
465            it should read as much data as is allowed within the
466            definition of the encoding and the given size, e.g.  if
467            optional encoding endings or state markers are available
468            on the stream, these should be read too.
469        """
470        # If we have lines cached, first merge them back into characters
471        if self.linebuffer:
472            self.charbuffer = "".join(self.linebuffer)
473            self.linebuffer = None
474
475        # read until we get the required number of characters (if available)
476        while True:
477            # can the request be satisfied from the character buffer?
478            if chars >= 0:
479                if len(self.charbuffer) >= chars:
480                    break
481            elif size >= 0:
482                if len(self.charbuffer) >= size:
483                    break
484            # we need more data
485            if size < 0:
486                newdata = self.stream.read()
487            else:
488                newdata = self.stream.read(size)
489            # decode bytes (those remaining from the last call included)
490            data = self.bytebuffer + newdata
491            try:
492                newchars, decodedbytes = self.decode(data, self.errors)
493            except UnicodeDecodeError, exc:
494                if firstline:
495                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
496                    lines = newchars.splitlines(True)
497                    if len(lines)<=1:
498                        raise
499                else:
500                    raise
501            # keep undecoded bytes until the next call
502            self.bytebuffer = data[decodedbytes:]
503            # put new characters in the character buffer
504            self.charbuffer += newchars
505            # there was no data available
506            if not newdata:
507                break
508        if chars < 0:
509            # Return everything we've got
510            result = self.charbuffer
511            self.charbuffer = ""
512        else:
513            # Return the first chars characters
514            result = self.charbuffer[:chars]
515            self.charbuffer = self.charbuffer[chars:]
516        return result
517
518    def readline(self, size=None, keepends=True):
519
520        """ Read one line from the input stream and return the
521            decoded data.
522
523            size, if given, is passed as size argument to the
524            read() method.
525
526        """
527        # If we have lines cached from an earlier read, return
528        # them unconditionally
529        if self.linebuffer:
530            line = self.linebuffer[0]
531            del self.linebuffer[0]
532            if len(self.linebuffer) == 1:
533                # revert to charbuffer mode; we might need more data
534                # next time
535                self.charbuffer = self.linebuffer[0]
536                self.linebuffer = None
537            if not keepends:
538                line = line.splitlines(False)[0]
539            return line
540
541        readsize = size or 72
542        line = ""
543        # If size is given, we call read() only once
544        while True:
545            data = self.read(readsize, firstline=True)
546            if data:
547                # If we're at a "\r" read one extra character (which might
548                # be a "\n") to get a proper line ending. If the stream is
549                # temporarily exhausted we return the wrong line ending.
550                if data.endswith("\r"):
551                    data += self.read(size=1, chars=1)
552
553            line += data
554            lines = line.splitlines(True)
555            if lines:
556                if len(lines) > 1:
557                    # More than one line result; the first line is a full line
558                    # to return
559                    line = lines[0]
560                    del lines[0]
561                    if len(lines) > 1:
562                        # cache the remaining lines
563                        lines[-1] += self.charbuffer
564                        self.linebuffer = lines
565                        self.charbuffer = None
566                    else:
567                        # only one remaining line, put it back into charbuffer
568                        self.charbuffer = lines[0] + self.charbuffer
569                    if not keepends:
570                        line = line.splitlines(False)[0]
571                    break
572                line0withend = lines[0]
573                line0withoutend = lines[0].splitlines(False)[0]
574                if line0withend != line0withoutend: # We really have a line end
575                    # Put the rest back together and keep it until the next call
576                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
577                    if keepends:
578                        line = line0withend
579                    else:
580                        line = line0withoutend
581                    break
582            # we didn't get anything or this was our only try
583            if not data or size is not None:
584                if line and not keepends:
585                    line = line.splitlines(False)[0]
586                break
587            if readsize<8000:
588                readsize *= 2
589        return line
590
591    def readlines(self, sizehint=None, keepends=True):
592
593        """ Read all lines available on the input stream
594            and return them as list of lines.
595
596            Line breaks are implemented using the codec's decoder
597            method and are included in the list entries.
598
599            sizehint, if given, is ignored since there is no efficient
600            way to finding the true end-of-line.
601
602        """
603        data = self.read()
604        return data.splitlines(keepends)
605
606    def reset(self):
607
608        """ Resets the codec buffers used for keeping state.
609
610            Note that no stream repositioning should take place.
611            This method is primarily intended to be able to recover
612            from decoding errors.
613
614        """
615        self.bytebuffer = ""
616        self.charbuffer = u""
617        self.linebuffer = None
618
619    def seek(self, offset, whence=0):
620        """ Set the input stream's current position.
621
622            Resets the codec buffers used for keeping state.
623        """
624        self.stream.seek(offset, whence)
625        self.reset()
626
627    def next(self):
628
629        """ Return the next decoded line from the input stream."""
630        line = self.readline()
631        if line:
632            return line
633        raise StopIteration
634
635    def __iter__(self):
636        return self
637
638    def __getattr__(self, name,
639                    getattr=getattr):
640
641        """ Inherit all other methods from the underlying stream.
642        """
643        return getattr(self.stream, name)
644
645    def __enter__(self):
646        return self
647
648    def __exit__(self, type, value, tb):
649        self.stream.close()
650
651###
652
653class StreamReaderWriter:
654
655    """ StreamReaderWriter instances allow wrapping streams which
656        work in both read and write modes.
657
658        The design is such that one can use the factory functions
659        returned by the codec.lookup() function to construct the
660        instance.
661
662    """
663    # Optional attributes set by the file wrappers below
664    encoding = 'unknown'
665
666    def __init__(self, stream, Reader, Writer, errors='strict'):
667
668        """ Creates a StreamReaderWriter instance.
669
670            stream must be a Stream-like object.
671
672            Reader, Writer must be factory functions or classes
673            providing the StreamReader, StreamWriter interface resp.
674
675            Error handling is done in the same way as defined for the
676            StreamWriter/Readers.
677
678        """
679        self.stream = stream
680        self.reader = Reader(stream, errors)
681        self.writer = Writer(stream, errors)
682        self.errors = errors
683
684    def read(self, size=-1):
685
686        return self.reader.read(size)
687
688    def readline(self, size=None):
689
690        return self.reader.readline(size)
691
692    def readlines(self, sizehint=None):
693
694        return self.reader.readlines(sizehint)
695
696    def next(self):
697
698        """ Return the next decoded line from the input stream."""
699        return self.reader.next()
700
701    def __iter__(self):
702        return self
703
704    def write(self, data):
705
706        return self.writer.write(data)
707
708    def writelines(self, list):
709
710        return self.writer.writelines(list)
711
712    def reset(self):
713
714        self.reader.reset()
715        self.writer.reset()
716
717    def seek(self, offset, whence=0):
718        self.stream.seek(offset, whence)
719        self.reader.reset()
720        if whence == 0 and offset == 0:
721            self.writer.reset()
722
723    def __getattr__(self, name,
724                    getattr=getattr):
725
726        """ Inherit all other methods from the underlying stream.
727        """
728        return getattr(self.stream, name)
729
730    # these are needed to make "with codecs.open(...)" work properly
731
732    def __enter__(self):
733        return self
734
735    def __exit__(self, type, value, tb):
736        self.stream.close()
737
738###
739
740class StreamRecoder:
741
742    """ StreamRecoder instances provide a frontend - backend
743        view of encoding data.
744
745        They use the complete set of APIs returned by the
746        codecs.lookup() function to implement their task.
747
748        Data written to the stream is first decoded into an
749        intermediate format (which is dependent on the given codec
750        combination) and then written to the stream using an instance
751        of the provided Writer class.
752
753        In the other direction, data is read from the stream using a
754        Reader instance and then return encoded data to the caller.
755
756    """
757    # Optional attributes set by the file wrappers below
758    data_encoding = 'unknown'
759    file_encoding = 'unknown'
760
761    def __init__(self, stream, encode, decode, Reader, Writer,
762                 errors='strict'):
763
764        """ Creates a StreamRecoder instance which implements a two-way
765            conversion: encode and decode work on the frontend (the
766            input to .read() and output of .write()) while
767            Reader and Writer work on the backend (reading and
768            writing to the stream).
769
770            You can use these objects to do transparent direct
771            recodings from e.g. latin-1 to utf-8 and back.
772
773            stream must be a file-like object.
774
775            encode, decode must adhere to the Codec interface, Reader,
776            Writer must be factory functions or classes providing the
777            StreamReader, StreamWriter interface resp.
778
779            encode and decode are needed for the frontend translation,
780            Reader and Writer for the backend translation. Unicode is
781            used as intermediate encoding.
782
783            Error handling is done in the same way as defined for the
784            StreamWriter/Readers.
785
786        """
787        self.stream = stream
788        self.encode = encode
789        self.decode = decode
790        self.reader = Reader(stream, errors)
791        self.writer = Writer(stream, errors)
792        self.errors = errors
793
794    def read(self, size=-1):
795
796        data = self.reader.read(size)
797        data, bytesencoded = self.encode(data, self.errors)
798        return data
799
800    def readline(self, size=None):
801
802        if size is None:
803            data = self.reader.readline()
804        else:
805            data = self.reader.readline(size)
806        data, bytesencoded = self.encode(data, self.errors)
807        return data
808
809    def readlines(self, sizehint=None):
810
811        data = self.reader.read()
812        data, bytesencoded = self.encode(data, self.errors)
813        return data.splitlines(1)
814
815    def next(self):
816
817        """ Return the next decoded line from the input stream."""
818        data = self.reader.next()
819        data, bytesencoded = self.encode(data, self.errors)
820        return data
821
822    def __iter__(self):
823        return self
824
825    def write(self, data):
826
827        data, bytesdecoded = self.decode(data, self.errors)
828        return self.writer.write(data)
829
830    def writelines(self, list):
831
832        data = ''.join(list)
833        data, bytesdecoded = self.decode(data, self.errors)
834        return self.writer.write(data)
835
836    def reset(self):
837
838        self.reader.reset()
839        self.writer.reset()
840
841    def __getattr__(self, name,
842                    getattr=getattr):
843
844        """ Inherit all other methods from the underlying stream.
845        """
846        return getattr(self.stream, name)
847
848    def __enter__(self):
849        return self
850
851    def __exit__(self, type, value, tb):
852        self.stream.close()
853
854### Shortcuts
855
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        If an encoding is given, the file is always opened in binary
        mode, even if no binary mode was specified. This is done to
        avoid data loss due to encodings using 8-bit values. The
        default file mode is 'rb' meaning to open the file in binary
        read mode.

        encoding specifies the encoding which is to be used for the
        file. If it is None, the plain file object returned by the
        builtin open() is handed back unwrapped.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

        If looking up the encoding or building the wrapper fails, the
        file is closed again before the exception is passed on.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            # 'U' implied reading; make that explicit once it is removed
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return file
    try:
        info = lookup(encoding)
        srw = StreamReaderWriter(file, info.streamreader, info.streamwriter,
                                 errors)
        # Add attributes to simplify introspection
        srw.encoding = encoding
        return srw
    except:
        # Deliberately catches everything: the file must not stay open
        # when it cannot be wrapped, and the error is always re-raised.
        file.close()
        raise
904
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap file in a StreamRecoder which translates between two
        encodings on the fly.

        Strings written to the wrapper are interpreted according to
        data_encoding and stored in the original file using
        file_encoding. Strings read from the file are interpreted
        using file_encoding and handed to the caller in
        data_encoding. The intermediate format is usually Unicode,
        but that depends on the codecs involved.

        file_encoding defaults to data_encoding when it is not given.

        errors selects the error handling scheme. The default,
        'strict', causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned object carries two extra attributes,
        .data_encoding and .file_encoding, holding the encoding
        names that were used, for introspection by Python programs.

    """
    if file_encoding is None:
        # A single encoding serves both sides of the wrapper
        file_encoding = data_encoding
    caller_codec = lookup(data_encoding)
    storage_codec = lookup(file_encoding)
    recoder = StreamRecoder(
        file,
        caller_codec.encode, caller_codec.decode,
        storage_codec.streamreader, storage_codec.streamwriter,
        errors)
    # Record both encoding names on the wrapper for introspection
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
940
941### Helpers for codec lookup
942
def getencoder(encoding):

    """ Look up the codec for the given encoding and return its
        encoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.encode
952
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return its
        decoder function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.decode
962
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalEncoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental encoder.

    """
    codec_info = lookup(encoding)
    factory = codec_info.incrementalencoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental encoder
    raise LookupError(encoding)
976
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return its
        IncrementalDecoder class or factory function.

        A LookupError is raised if the encoding cannot be found or
        if the codec does not provide an incremental decoder.

    """
    codec_info = lookup(encoding)
    factory = codec_info.incrementaldecoder
    if factory is not None:
        return factory
    # The codec exists but offers no incremental decoder
    raise LookupError(encoding)
990
def getreader(encoding):

    """ Look up the codec for the given encoding and return its
        StreamReader class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamreader
1000
def getwriter(encoding):

    """ Look up the codec for the given encoding and return its
        StreamWriter class or factory function.

        A LookupError is raised if the encoding cannot be found.

    """
    codec_info = lookup(encoding)
    return codec_info.streamwriter
1010
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Feeds every string from the iterator to an IncrementalEncoder for
    the given encoding and yields each non-empty chunk it produces.
    After the input is exhausted the encoder is flushed once with an
    empty final input.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    incremental = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = incremental.encode(chunk)
        if not encoded:
            # Nothing to emit for this chunk
            continue
        yield encoded
    # Final call: let the encoder emit whatever it still holds
    tail = incremental.encode("", True)
    if tail:
        yield tail
1028
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Feeds every string from the iterator to an IncrementalDecoder for
    the given encoding and yields each non-empty chunk it produces.
    After the input is exhausted the decoder is flushed once with an
    empty final input.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    incremental = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = incremental.decode(chunk)
        if not decoded:
            # Nothing to emit for this chunk
            continue
        yield decoded
    # Final call: let the decoder emit whatever it still holds
    tail = incremental.decode("", True)
    if tail:
        yield tail
1046
1047### Helpers for charmap-based codecs
1048
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Build a new dictionary in which every element of the rng
        sequence is a key that maps to itself.

    """
    return dict((item, item) for item in rng)
1061
def make_encoding_map(decoding_map):

    """ Create an encoding map by inverting a decoding map.

        A target that occurs more than once in the decoding map has
        no unique inverse. Such a target is mapped to None (undefined
        mapping), which causes an exception when the charmap codec
        encounters it during translation.

        One example where this happens is cp875.py, which decodes
        multiple characters to \\u001a.

    """
    encoding_map = {}
    for source, target in decoding_map.items():
        if target in encoding_map:
            # Seen before: the reverse mapping is ambiguous
            encoding_map[target] = None
        else:
            encoding_map[target] = source
    return encoding_map
1082
1083### error handlers
1084
try:
    # Module-level aliases for the builtin error handling callbacks
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing;
    # fall back to None for every alias, including any already bound
    strict_errors = ignore_errors = replace_errors = None
    xmlcharrefreplace_errors = backslashreplace_errors = None
1098
# Tell modulefinder that using codecs probably needs the encodings
# package.  The import below never executes because _false is 0.
# NOTE(review): the guard is a variable rather than a literal,
# presumably so the import statement is kept in the compiled module
# for modulefinder to discover -- confirm before simplifying it.
_false = 0
if _false:
    import encodings
1104
1105### Tests
1106
if __name__ == '__main__':

    # Only runs when this module is executed as a script: both
    # standard streams are rebound to recoding wrappers built with
    # EncodedFile(file, data_encoding, file_encoding).

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')
1114