• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1""" Python 'utf-8-sig' Codec
This works similarly to UTF-8, with the following changes:
3
4* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
5  first three bytes.
6
7* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
8  bytes will be skipped.
9"""
10import codecs
11
12### Codec APIs
13
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a UTF-8 BOM prepended.

    Returns a ``(data, length)`` tuple: the BOM followed by the UTF-8
    encoding of *input*, and ``len(input)`` as the amount of input consumed.
    *errors* is handed unchanged to ``codecs.utf_8_encode``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
17
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping a leading UTF-8 BOM if one is present.

    Returns a ``(text, consumed)`` tuple. ``consumed`` counts the three BOM
    bytes when they were skipped. The underlying decode is always final,
    i.e. *input* is treated as complete data.
    """
    skipped = 0
    payload = input
    if input[:3] == codecs.BOM_UTF8:
        # Drop the signature and remember that it counts as consumed input.
        skipped = 3
        payload = input[3:]
    text, used = codecs.utf_8_decode(payload, errors, True)
    return (text, used + skipped)
25
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental 'utf-8-sig' encoder.

    The first chunk produced after construction or ``reset()`` is prefixed
    with the UTF-8 BOM; later chunks are plain UTF-8. ``self.first`` is the
    flag that records whether the BOM is still pending (1) or not (0).
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return the UTF-8 bytes for *input*, BOM-prefixed on the first call."""
        lead = b""
        if self.first:
            # Clear the flag before encoding, so it stays cleared even if
            # the encoding step below raises.
            self.first = 0
            lead = codecs.BOM_UTF8
        return lead + codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Return to the initial state: the next chunk gets a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from a value given by getstate()."""
        self.first = state
48
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental 'utf-8-sig' decoder.

    Skips a UTF-8 BOM at the very start of the data and decodes everything
    else as UTF-8. ``self.first`` is 1 while it is still undecided whether
    the data starts with a BOM, and 0 once that has been settled.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode *input* and return a ``(text, consumed)`` tuple.

        While the BOM decision is pending and fewer than three bytes are
        available, nothing is consumed as long as the bytes could still be
        the beginning of a BOM and more data may follow. When *final* is
        true no more data will follow, so a non-empty partial BOM is passed
        to the UTF-8 decoder, which handles it as a truncated sequence
        according to *errors* (the same outcome as the one-shot decode()).
        """
        if self.first:
            if len(input) < 3:
                maybe_bom = codecs.BOM_UTF8.startswith(input)
                if maybe_bom and (not final or len(input) == 0):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return ("", 0)
                # Either these bytes cannot start a BOM, or the input ends
                # here: stop looking for a BOM and decode as plain UTF-8.
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                       codecs.utf_8_decode(input[3:], errors, final)
                    # The three skipped BOM bytes count as consumed input.
                    return (output, consumed+3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Return to the initial state: a leading BOM is skipped again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)`` as the decoder state."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]
84
class StreamWriter(codecs.StreamWriter):
    """Stream writer that prefixes the first encoded chunk with a UTF-8 BOM.

    The class-level encode() runs only for the first chunk: it installs
    ``codecs.utf_8_encode`` as an instance attribute named ``encode``, which
    shadows the method for every later call. reset() removes that instance
    attribute so the next chunk gets a BOM again.
    """

    def reset(self):
        codecs.StreamWriter.reset(self)
        # Remove the per-instance override, if one has been installed.
        vars(self).pop('encode', None)

    def encode(self, input, errors='strict'):
        # Switch later calls over to plain UTF-8 before producing the
        # BOM-prefixed first chunk with the module-level encode().
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
96
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the data.

    The class-level decode() runs until it has settled whether the data
    begins with a BOM; it then installs ``codecs.utf_8_decode`` as an
    instance attribute named ``decode`` for all later calls. reset()
    removes that instance attribute so BOM detection starts over.
    """

    def reset(self):
        codecs.StreamReader.reset(self)
        # Remove the per-instance override, if one has been installed.
        vars(self).pop('decode', None)

    def decode(self, input, errors='strict'):
        bom = codecs.BOM_UTF8
        too_short = len(input) < 3
        if too_short and bom.startswith(input):
            # not enough data to decide if this is a BOM
            # => try again on the next call
            return ("", 0)
        # The BOM question is settled either way: use plain UTF-8 from now on.
        self.decode = codecs.utf_8_decode
        if not too_short and input[:3] == bom:
            text, used = codecs.utf_8_decode(input[3:], errors)
            # The three skipped BOM bytes count as consumed input.
            return (text, used + 3)
        # no BOM present
        return codecs.utf_8_decode(input, errors)
118
119### encodings module API
120
def getregentry():
    """Return the ``codecs.CodecInfo`` entry describing the 'utf-8-sig' codec.

    The entry bundles this module's stateless encode/decode functions with
    its incremental and stream classes.
    """
    entry = codecs.CodecInfo(
        name='utf-8-sig',
        # stateless functions
        encode=encode,
        decode=decode,
        # incremental (chunk-wise) classes
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        # stream wrapper classes
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )
    return entry
131