1from __future__ import ( 2 print_function, division, absolute_import, unicode_literals) 3from fontTools.misc.py23 import * 4 5import re 6from bisect import bisect_right 7 8try: 9 # use unicodedata backport compatible with python2: 10 # https://github.com/mikekap/unicodedata2 11 from unicodedata2 import * 12except ImportError: # pragma: no cover 13 # fall back to built-in unicodedata (possibly outdated) 14 from unicodedata import * 15 16from . import Blocks, Scripts, ScriptExtensions, OTTags 17 18 19__all__ = [tostr(s) for s in ( 20 # names from built-in unicodedata module 21 "lookup", 22 "name", 23 "decimal", 24 "digit", 25 "numeric", 26 "category", 27 "bidirectional", 28 "combining", 29 "east_asian_width", 30 "mirrored", 31 "decomposition", 32 "normalize", 33 "unidata_version", 34 "ucd_3_2_0", 35 # additonal functions 36 "block", 37 "script", 38 "script_extension", 39 "script_name", 40 "script_code", 41 "script_horizontal_direction", 42 "ot_tags_from_script", 43 "ot_tag_to_script", 44)] 45 46 47def script(char): 48 """ Return the four-letter script code assigned to the Unicode character 49 'char' as string. 50 51 >>> script("a") 52 'Latn' 53 >>> script(",") 54 'Zyyy' 55 >>> script(unichr(0x10FFFF)) 56 'Zzzz' 57 """ 58 code = byteord(char) 59 # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which 60 # comes after (to the right of) any existing entries of x in a, and it 61 # partitions array a into two halves so that, for the left side 62 # all(val <= x for val in a[lo:i]), and for the right side 63 # all(val > x for val in a[i:hi]). 64 # Our 'SCRIPT_RANGES' is a sorted list of ranges (only their starting 65 # breakpoints); we want to use `bisect_right` to look up the range that 66 # contains the given codepoint: i.e. whose start is less than or equal 67 # to the codepoint. Thus, we subtract -1 from the index returned. 68 i = bisect_right(Scripts.RANGES, code) 69 return Scripts.VALUES[i-1] 70 71 72def script_extension(char): 73 """ Return the script extension property assigned to the Unicode character 74 'char' as a set of string. 75 76 >>> script_extension("a") == {'Latn'} 77 True 78 >>> script_extension(unichr(0x060C)) == {'Arab', 'Rohg', 'Syrc', 'Thaa'} 79 True 80 >>> script_extension(unichr(0x10FFFF)) == {'Zzzz'} 81 True 82 """ 83 code = byteord(char) 84 i = bisect_right(ScriptExtensions.RANGES, code) 85 value = ScriptExtensions.VALUES[i-1] 86 if value is None: 87 # code points not explicitly listed for Script Extensions 88 # have as their value the corresponding Script property value 89 return {script(char)} 90 return value 91 92 93def script_name(code, default=KeyError): 94 """ Return the long, human-readable script name given a four-letter 95 Unicode script code. 96 97 If no matching name is found, a KeyError is raised by default. 98 99 You can use the 'default' argument to return a fallback value (e.g. 100 'Unknown' or None) instead of throwing an error. 101 """ 102 try: 103 return str(Scripts.NAMES[code].replace("_", " ")) 104 except KeyError: 105 if isinstance(default, type) and issubclass(default, KeyError): 106 raise 107 return default 108 109 110_normalize_re = re.compile(r"[-_ ]+") 111 112 113def _normalize_property_name(string): 114 """Remove case, strip space, '-' and '_' for loose matching.""" 115 return _normalize_re.sub("", string).lower() 116 117 118_SCRIPT_CODES = {_normalize_property_name(v): k 119 for k, v in Scripts.NAMES.items()} 120 121 122def script_code(script_name, default=KeyError): 123 """Returns the four-letter Unicode script code from its long name 124 125 If no matching script code is found, a KeyError is raised by default. 126 127 You can use the 'default' argument to return a fallback string (e.g. 128 'Zzzz' or None) instead of throwing an error. 129 """ 130 normalized_name = _normalize_property_name(script_name) 131 try: 132 return _SCRIPT_CODES[normalized_name] 133 except KeyError: 134 if isinstance(default, type) and issubclass(default, KeyError): 135 raise 136 return default 137 138 139# The data on script direction is taken from harfbuzz's "hb-common.cc": 140# https://goo.gl/X5FDXC 141# It matches the CLDR "scriptMetadata.txt as of January 2018: 142# http://unicode.org/repos/cldr/trunk/common/properties/scriptMetadata.txt 143RTL_SCRIPTS = { 144 # Unicode-1.1 additions 145 'Arab', # Arabic 146 'Hebr', # Hebrew 147 148 # Unicode-3.0 additions 149 'Syrc', # Syriac 150 'Thaa', # Thaana 151 152 # Unicode-4.0 additions 153 'Cprt', # Cypriot 154 155 # Unicode-4.1 additions 156 'Khar', # Kharoshthi 157 158 # Unicode-5.0 additions 159 'Phnx', # Phoenician 160 'Nkoo', # Nko 161 162 # Unicode-5.1 additions 163 'Lydi', # Lydian 164 165 # Unicode-5.2 additions 166 'Avst', # Avestan 167 'Armi', # Imperial Aramaic 168 'Phli', # Inscriptional Pahlavi 169 'Prti', # Inscriptional Parthian 170 'Sarb', # Old South Arabian 171 'Orkh', # Old Turkic 172 'Samr', # Samaritan 173 174 # Unicode-6.0 additions 175 'Mand', # Mandaic 176 177 # Unicode-6.1 additions 178 'Merc', # Meroitic Cursive 179 'Mero', # Meroitic Hieroglyphs 180 181 # Unicode-7.0 additions 182 'Mani', # Manichaean 183 'Mend', # Mende Kikakui 184 'Nbat', # Nabataean 185 'Narb', # Old North Arabian 186 'Palm', # Palmyrene 187 'Phlp', # Psalter Pahlavi 188 189 # Unicode-8.0 additions 190 'Hatr', # Hatran 191 'Hung', # Old Hungarian 192 193 # Unicode-9.0 additions 194 'Adlm', # Adlam 195} 196 197def script_horizontal_direction(script_code, default=KeyError): 198 """ Return "RTL" for scripts that contain right-to-left characters 199 according to the Bidi_Class property. Otherwise return "LTR". 200 """ 201 if script_code not in Scripts.NAMES: 202 if isinstance(default, type) and issubclass(default, KeyError): 203 raise default(script_code) 204 return default 205 return str("RTL") if script_code in RTL_SCRIPTS else str("LTR") 206 207 208def block(char): 209 """ Return the block property assigned to the Unicode character 'char' 210 as a string. 211 212 >>> block("a") 213 'Basic Latin' 214 >>> block(unichr(0x060C)) 215 'Arabic' 216 >>> block(unichr(0xEFFFF)) 217 'No_Block' 218 """ 219 code = byteord(char) 220 i = bisect_right(Blocks.RANGES, code) 221 return Blocks.VALUES[i-1] 222 223 224def ot_tags_from_script(script_code): 225 """ Return a list of OpenType script tags associated with a given 226 Unicode script code. 227 Return ['DFLT'] script tag for invalid/unknown script codes. 228 """ 229 if script_code not in Scripts.NAMES: 230 return [OTTags.DEFAULT_SCRIPT] 231 232 script_tags = [ 233 OTTags.SCRIPT_EXCEPTIONS.get( 234 script_code, 235 script_code[0].lower() + script_code[1:] 236 ) 237 ] 238 if script_code in OTTags.NEW_SCRIPT_TAGS: 239 script_tags.extend(OTTags.NEW_SCRIPT_TAGS[script_code]) 240 script_tags.reverse() # last in, first out 241 242 return script_tags 243 244 245def ot_tag_to_script(tag): 246 """ Return the Unicode script code for the given OpenType script tag, or 247 None for "DFLT" tag or if there is no Unicode script associated with it. 248 Raises ValueError if the tag is invalid. 249 """ 250 tag = tostr(tag).strip() 251 if not tag or " " in tag or len(tag) > 4: 252 raise ValueError("invalid OpenType tag: %r" % tag) 253 254 while len(tag) != 4: 255 tag += str(" ") # pad with spaces 256 257 if tag == OTTags.DEFAULT_SCRIPT: 258 # it's unclear which Unicode script the "DFLT" OpenType tag maps to, 259 # so here we return None 260 return None 261 262 if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED: 263 return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag] 264 265 # This side of the conversion is fully algorithmic 266 267 # Any spaces at the end of the tag are replaced by repeating the last 268 # letter. Eg 'nko ' -> 'Nkoo'. 269 # Change first char to uppercase 270 script_code = tag[0].upper() + tag[1] 271 for i in range(2, 4): 272 script_code += (script_code[i-1] if tag[i] == " " else tag[i]) 273 274 if script_code not in Scripts.NAMES: 275 return None 276 return script_code 277