"""Extend the Python codecs module with a few encodings that are used in OpenType (name table) but missing from Python. See https://github.com/fonttools/fonttools/issues/236 for details.""" from __future__ import print_function, division, absolute_import from fontTools.misc.py23 import * import codecs import encodings class ExtendCodec(codecs.Codec): def __init__(self, name, base_encoding, mapping): self.name = name self.base_encoding = base_encoding self.mapping = mapping self.reverse = {v:k for k,v in mapping.items()} self.max_len = max(len(v) for v in mapping.values()) self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode) codecs.register_error(name, self.error) def encode(self, input, errors='strict'): assert errors == 'strict' #return codecs.encode(input, self.base_encoding, self.name), len(input) # The above line could totally be all we needed, relying on the error # handling to replace the unencodable Unicode characters with our extended # byte sequences. # # However, there seems to be a design bug in Python (probably intentional): # the error handler for encoding is supposed to return a **Unicode** character, # that then needs to be encodable itself... Ugh. # # So we implement what codecs.encode() should have been doing: which is expect # error handler to return bytes() to be added to the output. # # This seems to have been fixed in Python 3.3. We should try using that and # use fallback only if that failed. # https://docs.python.org/3.3/library/codecs.html#codecs.register_error length = len(input) out = b'' while input: try: part = codecs.encode(input, self.base_encoding) out += part input = '' # All converted except UnicodeEncodeError as e: # Convert the correct part out += codecs.encode(input[:e.start], self.base_encoding) replacement, pos = self.error(e) out += replacement input = input[pos:] return out, length def decode(self, input, errors='strict'): assert errors == 'strict' return codecs.decode(input, self.base_encoding, self.name), len(input) def error(self, e): if isinstance(e, UnicodeDecodeError): for end in range(e.start + 1, e.end + 1): s = e.object[e.start:end] if s in self.mapping: return self.mapping[s], end elif isinstance(e, UnicodeEncodeError): for end in range(e.start + 1, e.start + self.max_len + 1): s = e.object[e.start:end] if s in self.reverse: return self.reverse[s], end e.encoding = self.name raise e _extended_encodings = { "x_mac_japanese_ttx": ("shift_jis", { b"\xFC": unichr(0x007C), b"\x7E": unichr(0x007E), b"\x80": unichr(0x005C), b"\xA0": unichr(0x00A0), b"\xFD": unichr(0x00A9), b"\xFE": unichr(0x2122), b"\xFF": unichr(0x2026), }), "x_mac_trad_chinese_ttx": ("big5", { b"\x80": unichr(0x005C), b"\xA0": unichr(0x00A0), b"\xFD": unichr(0x00A9), b"\xFE": unichr(0x2122), b"\xFF": unichr(0x2026), }), "x_mac_korean_ttx": ("euc_kr", { b"\x80": unichr(0x00A0), b"\x81": unichr(0x20A9), b"\x82": unichr(0x2014), b"\x83": unichr(0x00A9), b"\xFE": unichr(0x2122), b"\xFF": unichr(0x2026), }), "x_mac_simp_chinese_ttx": ("gb2312", { b"\x80": unichr(0x00FC), b"\xA0": unichr(0x00A0), b"\xFD": unichr(0x00A9), b"\xFE": unichr(0x2122), b"\xFF": unichr(0x2026), }), } _cache = {} def search_function(name): name = encodings.normalize_encoding(name) # Rather undocumented... if name in _extended_encodings: if name not in _cache: base_encoding, mapping = _extended_encodings[name] assert(name[-4:] == "_ttx") # Python 2 didn't have any of the encodings that we are implementing # in this file. Python 3 added aliases for the East Asian ones, mapping # them "temporarily" to the same base encoding as us, with a comment # suggesting that full implementation will appear some time later. # As such, try the Python version of the x_mac_... first, if that is found, # use *that* as our base encoding. This would make our encoding upgrade # to the full encoding when and if Python finally implements that. # http://bugs.python.org/issue24041 base_encodings = [name[:-4], base_encoding] for base_encoding in base_encodings: try: codecs.lookup(base_encoding) except LookupError: continue _cache[name] = ExtendCodec(name, base_encoding, mapping) break return _cache[name].info return None codecs.register(search_function)