""" Python 'utf-8-sig' Codec

This works similarly to UTF-8 with the following changes:

* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the
  first three bytes.

* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these
  bytes will be skipped.
"""
import codecs

### Codec APIs

def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a leading BOM.

    Returns a ``(bytes, length)`` tuple where *length* is ``len(input)``,
    i.e. the BOM is not counted as consumed input.
    """
    return (codecs.BOM_UTF8 + codecs.utf_8_encode(input, errors)[0],
            len(input))

def decode(input, errors='strict'):
    """Decode *input* as UTF-8, skipping a leading BOM if there is one.

    Returns a ``(str, consumed)`` tuple; *consumed* includes the three BOM
    bytes when they were skipped.  The input is always treated as complete
    (``final=True``), so a truncated sequence is passed to the error handler.
    """
    prefix = 0
    if input[:3] == codecs.BOM_UTF8:
        input = input[3:]
        prefix = 3
    (output, consumed) = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + prefix)

class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that emits the BOM in front of the first chunk."""

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        # 1 while the BOM still has to be written, 0 afterwards.
        self.first = 1

    def encode(self, input, final=False):
        """Encode *input*; the first call after construction/reset adds the BOM."""
        if self.first:
            self.first = 0
            return codecs.BOM_UTF8 + \
                   codecs.utf_8_encode(input, self.errors)[0]
        else:
            return codecs.utf_8_encode(input, self.errors)[0]

    def reset(self):
        """Reset the encoder so that the next chunk is prefixed with a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the "BOM still pending" flag as the encoder state."""
        return self.first

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        self.first = state

class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental decoder that skips a BOM at the start of the data."""

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        # 1 until it has been decided whether the data starts with a BOM.
        self.first = 1

    def _buffer_decode(self, input, errors, final):
        """Decode the buffered *input*, dropping a leading BOM.

        Returns a ``(str, consumed)`` tuple.  While the start of the data is
        still an undecided prefix of the BOM and more data may follow,
        nothing is consumed.
        """
        if self.first:
            if len(input) < 3:
                # Wait for more data only if more data can actually arrive
                # (not final) or there is nothing pending at all.  A
                # non-empty partial BOM at the end of the input is truncated
                # UTF-8 and must go through the error handler, like decode()
                # does, instead of being dropped silently.
                if codecs.BOM_UTF8.startswith(input) and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return ("", 0)
                self.first = 0
            else:
                self.first = 0
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = \
                       codecs.utf_8_decode(input[3:], errors, final)
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return ``(buffered_bytes, first_flag)`` as the decoder state."""
        state = codecs.BufferedIncrementalDecoder.getstate(self)
        # state[1] must be 0 here, as it isn't passed along to the caller
        return (state[0], self.first)

    def setstate(self, state):
        """Restore a state previously returned by getstate()."""
        # state[1] will be ignored by BufferedIncrementalDecoder.setstate()
        codecs.BufferedIncrementalDecoder.setstate(self, state)
        self.first = state[1]

class StreamWriter(codecs.StreamWriter):
    """Stream writer that writes the BOM in front of the first chunk."""

    def reset(self):
        """Reset the writer so that the next encode() call adds a BOM again."""
        codecs.StreamWriter.reset(self)
        # Remove the instance attribute installed by encode() so that the
        # BOM-writing method of the class becomes visible again.
        try:
            del self.encode
        except AttributeError:
            pass

    def encode(self, input, errors='strict'):
        """Encode the first chunk with a BOM.

        Rebinds ``self.encode`` to the plain UTF-8 encoder, so all later
        calls on this instance bypass this method until reset() is called.
        """
        self.encode = codecs.utf_8_encode
        return encode(input, errors)

class StreamReader(codecs.StreamReader):
    """Stream reader that skips a BOM at the start of the stream."""

    def reset(self):
        """Reset the reader so that a BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Remove the instance attribute installed by decode() so that the
        # BOM-detecting method of the class becomes visible again.
        try:
            del self.decode
        except AttributeError:
            pass

    def decode(self, input, errors='strict'):
        """Decode the start of the stream, dropping a leading BOM.

        Once it is known whether a BOM is present, ``self.decode`` is rebound
        to the plain UTF-8 decoder for all later calls on this instance.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return ("", 0)
        elif input[:3] == codecs.BOM_UTF8:
            self.decode = codecs.utf_8_decode
            (output, consumed) = codecs.utf_8_decode(input[3:], errors)
            return (output, consumed + 3)
        # (else) no BOM present
        self.decode = codecs.utf_8_decode
        return codecs.utf_8_decode(input, errors)

### encodings module API

def getregentry():
    """Return the CodecInfo entry that registers this codec as 'utf-8-sig'."""
    return codecs.CodecInfo(
        name='utf-8-sig',
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )