• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from fontTools.misc.textTools import byteord, tostr
2
3import re
4from bisect import bisect_right
5
6try:
7    # use unicodedata backport compatible with python2:
8    # https://github.com/fonttools/unicodedata2
9    from unicodedata2 import *
10except ImportError:  # pragma: no cover
11    # fall back to built-in unicodedata (possibly outdated)
12    from unicodedata import *
13
14from . import Blocks, Scripts, ScriptExtensions, OTTags
15
16
# Public API of this module: the names re-exported from the (backported or
# built-in) unicodedata module, followed by the helpers defined in this file.
__all__ = [tostr(s) for s in (
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
)]
43
44
def script(char):
    """Look up the Unicode Script property of a single character.

    'char' is converted to its code point with ``byteord``; the result is
    the four-letter script code stored for the range containing it.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    codepoint = byteord(char)
    # Scripts.RANGES is a sorted list holding only the starting breakpoint
    # of each range. bisect_right gives the position just past the last
    # start that is <= codepoint, so the range containing the code point is
    # the entry immediately before that insertion point.
    range_index = bisect_right(Scripts.RANGES, codepoint) - 1
    return Scripts.VALUES[range_index]
68
69
def script_extension(char):
    """Look up the Script_Extensions property of a single character.

    Returns a set of four-letter script codes. When the table holds no
    explicit entry for the character, the set contains just its Script
    property value.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    codepoint = byteord(char)
    # Same range lookup as for scripts: the entry just before the insertion
    # point is the range whose start is <= codepoint.
    range_index = bisect_right(ScriptExtensions.RANGES, codepoint) - 1
    extensions = ScriptExtensions.VALUES[range_index]
    if extensions is not None:
        return extensions
    # A None entry means the code point is not explicitly listed for
    # Script Extensions; its value is then the corresponding Script value.
    return {script(char)}
89
90
def script_name(code, default=KeyError):
    """Translate a four-letter Unicode script code into its long name.

    The name is taken from ``Scripts.NAMES`` with underscores replaced by
    spaces.

    If 'code' is unknown, the KeyError from the lookup propagates unless
    'default' is given as something other than a KeyError class (e.g.
    'Unknown' or None), in which case that value is returned instead.
    """
    try:
        long_name = Scripts.NAMES[code]
    except KeyError:
        # Anything that is not a KeyError (sub)class is a fallback value.
        if not isinstance(default, type) or not issubclass(default, KeyError):
            return default
        raise
    else:
        return str(long_name.replace("_", " "))
106
107
108_normalize_re = re.compile(r"[-_ ]+")
109
110
111def _normalize_property_name(string):
112    """Remove case, strip space, '-' and '_' for loose matching."""
113    return _normalize_re.sub("", string).lower()
114
115
# Reverse of Scripts.NAMES: maps the loosely-normalized long script name
# to its script code, for use by script_code().
_SCRIPT_CODES = {
    _normalize_property_name(long_name): code
    for code, long_name in Scripts.NAMES.items()
}
118
119
def script_code(script_name, default=KeyError):
    """Translate a long script name into its four-letter Unicode code.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    If no script matches, the KeyError from the lookup propagates unless
    'default' is given as something other than a KeyError class (e.g.
    'Zzzz' or None), in which case that value is returned instead.
    """
    lookup_key = _normalize_property_name(script_name)
    try:
        found = _SCRIPT_CODES[lookup_key]
    except KeyError:
        # Anything that is not a KeyError (sub)class is a fallback value.
        if not isinstance(default, type) or not issubclass(default, KeyError):
            return default
        raise
    else:
        return found
135
136
# Set of four-letter Unicode script codes that script_horizontal_direction()
# reports as "RTL"; any known script not listed here is reported as "LTR".
# Entries are grouped by the Unicode version named in each group comment.
#
# The data on script direction is taken from Harfbuzz source code:
# https://github.com/harfbuzz/harfbuzz/blob/3.2.0/src/hb-common.cc#L514-L613
# This in turn references the following "Script_Metadata" document:
# https://docs.google.com/spreadsheets/d/1Y90M0Ie3MUJ6UVCRDOypOtijlMDLNNyyLk36T6iMu0o
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    'Arab',  # Arabic
    'Hebr',  # Hebrew

    # Unicode-3.0 additions
    'Syrc',  # Syriac
    'Thaa',  # Thaana

    # Unicode-4.0 additions
    'Cprt',  # Cypriot

    # Unicode-4.1 additions
    'Khar',  # Kharoshthi

    # Unicode-5.0 additions
    'Phnx',  # Phoenician
    'Nkoo',  # Nko

    # Unicode-5.1 additions
    'Lydi',  # Lydian

    # Unicode-5.2 additions
    'Avst',  # Avestan
    'Armi',  # Imperial Aramaic
    'Phli',  # Inscriptional Pahlavi
    'Prti',  # Inscriptional Parthian
    'Sarb',  # Old South Arabian
    'Orkh',  # Old Turkic
    'Samr',  # Samaritan

    # Unicode-6.0 additions
    'Mand',  # Mandaic

    # Unicode-6.1 additions
    'Merc',  # Meroitic Cursive
    'Mero',  # Meroitic Hieroglyphs

    # Unicode-7.0 additions
    'Mani',  # Manichaean
    'Mend',  # Mende Kikakui
    'Nbat',  # Nabataean
    'Narb',  # Old North Arabian
    'Palm',  # Palmyrene
    'Phlp',  # Psalter Pahlavi

    # Unicode-8.0 additions
    'Hatr',  # Hatran
    'Hung',  # Old Hungarian

    # Unicode-9.0 additions
    'Adlm',  # Adlam

    # Unicode-11.0 additions
    'Rohg',  # Hanifi Rohingya
    'Sogo',  # Old Sogdian
    'Sogd',  # Sogdian

    # Unicode-12.0 additions
    'Elym',  # Elymaic

    # Unicode-13.0 additions
    'Chrs',  # Chorasmian
    'Yezi',  # Yezidi

    # Unicode-14.0 additions
    'Ougr',  # Old Uyghur
}
209
def script_horizontal_direction(script_code, default=KeyError):
    """Return the horizontal writing direction of a Unicode script.

    The result is "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property (those listed in RTL_SCRIPTS),
    and "LTR" for every other known script.

    If 'script_code' is not a known script code, 'default' decides the
    outcome: a KeyError class (the default) is instantiated with the code
    and raised; any other value is returned as-is.
    """
    if script_code in Scripts.NAMES:
        if script_code in RTL_SCRIPTS:
            return "RTL"
        return "LTR"

    # Unknown script: anything that is not a KeyError (sub)class is a
    # fallback value to hand back to the caller.
    if not isinstance(default, type) or not issubclass(default, KeyError):
        return default
    raise default(script_code)
219
220
def block(char):
    """Look up the Unicode Block property of a single character.

    'char' is converted to its code point with ``byteord``; the result is
    the block name stored for the range containing it.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    codepoint = byteord(char)
    # The entry just before the insertion point is the range whose start
    # is <= codepoint.
    range_index = bisect_right(Blocks.RANGES, codepoint) - 1
    return Blocks.VALUES[range_index]
235
236
def ot_tags_from_script(script_code):
    """List the OpenType script tags associated with a Unicode script code.

    Invalid or unknown script codes yield the single default tag,
    ['DFLT']. When the script has entries in ``OTTags.NEW_SCRIPT_TAGS``,
    those are appended and the whole list is reversed, so the most recently
    added tag comes first.
    """
    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    # Unless listed as an exception, the tag is the script code with its
    # first letter lower-cased.
    lowercased = script_code[0].lower() + script_code[1:]
    tags = [OTTags.SCRIPT_EXCEPTIONS.get(script_code, lowercased)]

    if script_code in OTTags.NEW_SCRIPT_TAGS:
        tags.extend(OTTags.NEW_SCRIPT_TAGS[script_code])
        tags.reverse()  # last in, first out

    return tags
256
257
def ot_tag_to_script(tag):
    """Map an OpenType script tag to a Unicode script code.

    Returns None for the "DFLT" tag, or when no Unicode script is
    associated with the tag.

    Raises ValueError if the tag is invalid: empty after stripping
    surrounding whitespace, containing a space, or longer than four
    characters.
    """
    tag = tostr(tag).strip()
    if len(tag) > 4 or " " in tag or not tag:
        raise ValueError("invalid OpenType tag: %r" % tag)

    # Resolve aliases first, then pad with spaces to the four-character
    # tag width.
    tag = OTTags.SCRIPT_ALIASES.get(tag, tag)
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    # This side of the conversion is fully algorithmic:
    # the first character is upper-cased, and any spaces at the end of the
    # tag are replaced by repeating the last letter. Eg 'nko ' -> 'Nkoo'.
    letters = [tag[0].upper(), tag[1]]
    for ch in tag[2:4]:
        letters.append(letters[-1] if ch == " " else ch)
    candidate = "".join(letters)

    return candidate if candidate in Scripts.NAMES else None
293