• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Extend the Python codecs module with a few encodings that are used in OpenType (name table)
2but missing from Python.  See https://github.com/fonttools/fonttools/issues/236 for details."""
3
4from __future__ import print_function, division, absolute_import
5from fontTools.misc.py23 import *
6import codecs
7import encodings
8
class ExtendCodec(codecs.Codec):
	"""Codec wrapping a base encoding, extended with extra byte<->character
	mappings that the base encoding lacks (used for OpenType 'name' table
	encodings missing from Python; see fontTools issue #236).

	Each instance registers itself as a codecs error handler under its own
	name, so the extension mapping is consulted whenever the base encoding
	fails on a byte sequence (decode) or on a character (encode).
	"""

	def __init__(self, name, base_encoding, mapping):
		# mapping: bytes -> unicode character the base encoding cannot decode.
		self.name = name
		self.base_encoding = base_encoding
		self.mapping = mapping
		# Inverse table for encoding: unicode character -> bytes.
		self.reverse = {v:k for k,v in mapping.items()}
		# Longest mapped character sequence; bounds the search in error().
		self.max_len = max(len(v) for v in mapping.values())
		self.info = codecs.CodecInfo(name=self.name, encode=self.encode, decode=self.decode)
		# Register ourselves as an error handler keyed by the codec name, so
		# decode() can pass errors=self.name to the base codec and have our
		# extension mapping applied on failure.
		codecs.register_error(name, self.error)

	def encode(self, input, errors='strict'):
		"""Encode *input* (unicode) to bytes, returning (bytes, length-consumed)."""
		assert errors == 'strict'
		#return codecs.encode(input, self.base_encoding, self.name), len(input)

		# The above line could totally be all we needed, relying on the error
		# handling to replace the unencodable Unicode characters with our extended
		# byte sequences.
		#
		# However, there seems to be a design bug in Python (probably intentional):
		# the error handler for encoding is supposed to return a **Unicode** character,
		# that then needs to be encodable itself...  Ugh.
		#
		# So we implement what codecs.encode() should have been doing: which is expect
		# error handler to return bytes() to be added to the output.
		#
		# This seems to have been fixed in Python 3.3.  We should try using that and
		# use fallback only if that failed.
		# https://docs.python.org/3.3/library/codecs.html#codecs.register_error

		length = len(input)
		out = b''
		while input:
			try:
				part = codecs.encode(input, self.base_encoding)
				out += part
				input = '' # All converted
			except UnicodeEncodeError as e:
				# Convert the correct part, substitute the offending character(s)
				# from our reverse mapping, then continue with the remainder.
				out += codecs.encode(input[:e.start], self.base_encoding)
				replacement, pos = self.error(e)
				out += replacement
				input = input[pos:]
		return out, length

	def decode(self, input, errors='strict'):
		"""Decode *input* (bytes) to unicode, returning (text, length-consumed).

		Decoding can rely on the error-handler protocol directly (unlike
		encode, above): the handler registered under self.name supplies the
		replacement characters for bytes the base encoding rejects.
		"""
		assert errors == 'strict'
		return codecs.decode(input, self.base_encoding, self.name), len(input)

	def error(self, e):
		"""Error handler consulted when the base encoding fails.

		For decode errors, returns (unicode-replacement, resume-position) from
		self.mapping; for encode errors, returns (bytes-replacement,
		resume-position) from self.reverse.  Note: returning bytes from an
		encode error handler predates official support (Python 3.3); encode()
		above therefore drives this handler manually.  Re-raises *e* (tagged
		with our codec name) when no mapping entry matches.
		"""
		if isinstance(e, UnicodeDecodeError):
			# Try progressively longer byte prefixes of the failing region.
			for end in range(e.start + 1, e.end + 1):
				s = e.object[e.start:end]
				if s in self.mapping:
					return self.mapping[s], end
		elif isinstance(e, UnicodeEncodeError):
			# Try progressively longer character runs, up to the longest
			# sequence present in the mapping.
			for end in range(e.start + 1, e.start + self.max_len + 1):
				s = e.object[e.start:end]
				if s in self.reverse:
					return self.reverse[s], end
		e.encoding = self.name
		raise e
71
72
def _build_mapping(codepoints):
	"""Turn a {byte-string: unicode-codepoint} table into the byte->character
	mapping that ExtendCodec expects."""
	return {byte: unichr(cp) for byte, cp in codepoints.items()}

# Extended encodings we provide: name -> (base encoding, extension mapping).
# Each mapping lists the bytes that Apple's variant of the base encoding
# assigns differently from (or in addition to) the standard encoding.
_extended_encodings = {
	"x_mac_japanese_ttx": ("shift_jis", _build_mapping({
		b"\xFC": 0x007C,
		b"\x7E": 0x007E,
		b"\x80": 0x005C,
		b"\xA0": 0x00A0,
		b"\xFD": 0x00A9,
		b"\xFE": 0x2122,
		b"\xFF": 0x2026,
	})),
	"x_mac_trad_chinese_ttx": ("big5", _build_mapping({
		b"\x80": 0x005C,
		b"\xA0": 0x00A0,
		b"\xFD": 0x00A9,
		b"\xFE": 0x2122,
		b"\xFF": 0x2026,
	})),
	"x_mac_korean_ttx": ("euc_kr", _build_mapping({
		b"\x80": 0x00A0,
		b"\x81": 0x20A9,
		b"\x82": 0x2014,
		b"\x83": 0x00A9,
		b"\xFE": 0x2122,
		b"\xFF": 0x2026,
	})),
	"x_mac_simp_chinese_ttx": ("gb2312", _build_mapping({
		b"\x80": 0x00FC,
		b"\xA0": 0x00A0,
		b"\xFD": 0x00A9,
		b"\xFE": 0x2122,
		b"\xFF": 0x2026,
	})),
}
106
# Lazily-populated cache of ExtendCodec instances, keyed by normalized
# encoding name; filled by search_function() on first lookup.
_cache = {}
108
def search_function(name):
	"""Codec search function for the x_mac_*_ttx extended encodings.

	Returns a codecs.CodecInfo for *name* when it is one of our extended
	encodings (constructing and caching the ExtendCodec on first use), or
	None when the name is not ours or no usable base encoding exists in
	this interpreter.
	"""
	name = encodings.normalize_encoding(name) # Rather undocumented...
	if name in _extended_encodings:
		if name not in _cache:
			base_encoding, mapping = _extended_encodings[name]
			assert(name[-4:] == "_ttx")
			# Python 2 didn't have any of the encodings that we are implementing
			# in this file.  Python 3 added aliases for the East Asian ones, mapping
			# them "temporarily" to the same base encoding as us, with a comment
			# suggesting that full implementation will appear some time later.
			# As such, try the Python version of the x_mac_... first, if that is found,
			# use *that* as our base encoding.  This would make our encoding upgrade
			# to the full encoding when and if Python finally implements that.
			# http://bugs.python.org/issue24041
			base_encodings = [name[:-4], base_encoding]
			for base_encoding in base_encodings:
				try:
					codecs.lookup(base_encoding)
				except LookupError:
					continue
				_cache[name] = ExtendCodec(name, base_encoding, mapping)
				break
			else:
				# Neither the Python-native variant nor our fallback base
				# encoding is available in this interpreter.  Report "not
				# found" instead of raising KeyError on the lookup below.
				return None
		return _cache[name].info

	return None
134
# Hook our search function into the codecs machinery at import time, making
# the x_mac_*_ttx encodings available to str/bytes encode()/decode() globally.
codecs.register(search_function)
136