"""Unicode character database helpers.

Re-exports the ``unicodedata`` API (preferring the ``unicodedata2`` backport
when it is installed) and adds lookups for the Block, Script and
Script_Extensions properties, plus conversions between Unicode script codes
and OpenType script tags.
"""

from fontTools.misc.textTools import byteord, tostr

import re
from bisect import bisect_right

try:
    # use unicodedata backport compatible with python2:
    # https://github.com/fonttools/unicodedata2
    from unicodedata2 import *
except ImportError:  # pragma: no cover
    # fall back to built-in unicodedata (possibly outdated)
    from unicodedata import *

from . import Blocks, Scripts, ScriptExtensions, OTTags


__all__ = [
    # names from built-in unicodedata module
    "lookup",
    "name",
    "decimal",
    "digit",
    "numeric",
    "category",
    "bidirectional",
    "combining",
    "east_asian_width",
    "mirrored",
    "decomposition",
    "normalize",
    "unidata_version",
    "ucd_3_2_0",
    # additional functions
    "block",
    "script",
    "script_extension",
    "script_name",
    "script_code",
    "script_horizontal_direction",
    "ot_tags_from_script",
    "ot_tag_to_script",
]


def script(char):
    """Return the four-letter script code assigned to the Unicode character
    'char' as string.

    >>> script("a")
    'Latn'
    >>> script(",")
    'Zyyy'
    >>> script(chr(0x10FFFF))
    'Zzzz'
    """
    code = byteord(char)
    # 'bisect_right(a, x, lo=0, hi=len(a))' returns an insertion point which
    # comes after (to the right of) any existing entries of x in a, and it
    # partitions array a into two halves so that, for the left side
    # all(val <= x for val in a[lo:i]), and for the right side
    # all(val > x for val in a[i:hi]).
    # 'Scripts.RANGES' is a sorted list of ranges (only their starting
    # breakpoints); we want to use `bisect_right` to look up the range that
    # contains the given codepoint: i.e. whose start is less than or equal
    # to the codepoint. Thus, we subtract 1 from the index returned.
    i = bisect_right(Scripts.RANGES, code)
    return Scripts.VALUES[i - 1]


def script_extension(char):
    """Return the script extension property assigned to the Unicode character
    'char' as a set of string.

    The returned set is a fresh object on every call, so callers may modify
    it freely.

    >>> script_extension("a") == {'Latn'}
    True
    >>> script_extension(chr(0x060C)) == {'Rohg', 'Syrc', 'Yezi', 'Arab', 'Thaa', 'Nkoo'}
    True
    >>> script_extension(chr(0x10FFFF)) == {'Zzzz'}
    True
    """
    code = byteord(char)
    # Same range lookup as in 'script': pick the last range whose starting
    # breakpoint is <= code.
    i = bisect_right(ScriptExtensions.RANGES, code)
    value = ScriptExtensions.VALUES[i - 1]
    if value is None:
        # code points not explicitly listed for Script Extensions
        # have as their value the corresponding Script property value
        return {script(char)}
    # Hand out a copy: 'value' is the object stored in the shared lookup
    # table, and a caller mutating it would corrupt every later lookup.
    return set(value)


def script_name(code, default=KeyError):
    """Return the long, human-readable script name given a four-letter
    Unicode script code.

    If no matching name is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback value (e.g.
    'Unknown' or None) instead of throwing an error.
    """
    try:
        # Names are stored with underscores; present them with spaces.
        return Scripts.NAMES[code].replace("_", " ")
    except KeyError:
        # 'default' being KeyError (or a subclass of it) means "no fallback":
        # let the lookup error propagate.
        if isinstance(default, type) and issubclass(default, KeyError):
            raise
        return default


# Runs of '-', '_' and ' ' are ignored when matching property names loosely.
_normalize_re = re.compile(r"[-_ ]+")


def _normalize_property_name(string):
    """Remove case, strip space, '-' and '_' for loose matching."""
    return _normalize_re.sub("", string).lower()


# Reverse of 'Scripts.NAMES': normalized long script name -> script code.
_SCRIPT_CODES = {
    _normalize_property_name(v): k for k, v in Scripts.NAMES.items()
}


def script_code(script_name, default=KeyError):
    """Return the four-letter Unicode script code from its long name.

    The name is matched loosely: case, spaces, '-' and '_' are ignored.

    If no matching script code is found, a KeyError is raised by default.

    You can use the 'default' argument to return a fallback string (e.g.
    'Zzzz' or None) instead of throwing an error.
    """
    normalized_name = _normalize_property_name(script_name)
    try:
        return _SCRIPT_CODES[normalized_name]
    except KeyError:
        # 'default' being KeyError (or a subclass of it) means "no fallback":
        # let the lookup error propagate.
        if isinstance(default, type) and issubclass(default, KeyError):
            raise
        return default


# The data on script direction is taken from Harfbuzz source code:
# https://github.com/harfbuzz/harfbuzz/blob/3.2.0/src/hb-common.cc#L514-L613
# This in turn references the following "Script_Metadata" document:
# https://docs.google.com/spreadsheets/d/1Y90M0Ie3MUJ6UVCRDOypOtijlMDLNNyyLk36T6iMu0o
RTL_SCRIPTS = {
    # Unicode-1.1 additions
    'Arab',  # Arabic
    'Hebr',  # Hebrew

    # Unicode-3.0 additions
    'Syrc',  # Syriac
    'Thaa',  # Thaana

    # Unicode-4.0 additions
    'Cprt',  # Cypriot

    # Unicode-4.1 additions
    'Khar',  # Kharoshthi

    # Unicode-5.0 additions
    'Phnx',  # Phoenician
    'Nkoo',  # Nko

    # Unicode-5.1 additions
    'Lydi',  # Lydian

    # Unicode-5.2 additions
    'Avst',  # Avestan
    'Armi',  # Imperial Aramaic
    'Phli',  # Inscriptional Pahlavi
    'Prti',  # Inscriptional Parthian
    'Sarb',  # Old South Arabian
    'Orkh',  # Old Turkic
    'Samr',  # Samaritan

    # Unicode-6.0 additions
    'Mand',  # Mandaic

    # Unicode-6.1 additions
    'Merc',  # Meroitic Cursive
    'Mero',  # Meroitic Hieroglyphs

    # Unicode-7.0 additions
    'Mani',  # Manichaean
    'Mend',  # Mende Kikakui
    'Nbat',  # Nabataean
    'Narb',  # Old North Arabian
    'Palm',  # Palmyrene
    'Phlp',  # Psalter Pahlavi

    # Unicode-8.0 additions
    'Hatr',  # Hatran
    'Hung',  # Old Hungarian

    # Unicode-9.0 additions
    'Adlm',  # Adlam

    # Unicode-11.0 additions
    'Rohg',  # Hanifi Rohingya
    'Sogo',  # Old Sogdian
    'Sogd',  # Sogdian

    # Unicode-12.0 additions
    'Elym',  # Elymaic

    # Unicode-13.0 additions
    'Chrs',  # Chorasmian
    'Yezi',  # Yezidi

    # Unicode-14.0 additions
    'Ougr',  # Old Uyghur
}


def script_horizontal_direction(script_code, default=KeyError):
    """Return "RTL" for scripts that contain right-to-left characters
    according to the Bidi_Class property. Otherwise return "LTR".

    If 'script_code' is not a known script code, a KeyError is raised by
    default (when 'default' is a KeyError subclass, that subclass is raised
    instead).

    You can use the 'default' argument to return a fallback value (e.g.
    'LTR' or None) instead of throwing an error.
    """
    if script_code not in Scripts.NAMES:
        if isinstance(default, type) and issubclass(default, KeyError):
            raise default(script_code)
        return default
    return "RTL" if script_code in RTL_SCRIPTS else "LTR"


def block(char):
    """Return the block property assigned to the Unicode character 'char'
    as a string.

    >>> block("a")
    'Basic Latin'
    >>> block(chr(0x060C))
    'Arabic'
    >>> block(chr(0xEFFFF))
    'No_Block'
    """
    code = byteord(char)
    # Same range lookup as in 'script': pick the last range whose starting
    # breakpoint is <= code.
    i = bisect_right(Blocks.RANGES, code)
    return Blocks.VALUES[i - 1]


def ot_tags_from_script(script_code):
    """Return a list of OpenType script tags associated with a given
    Unicode script code.
    Return ['DFLT'] script tag for invalid/unknown script codes.
    """
    if script_code not in Scripts.NAMES:
        return [OTTags.DEFAULT_SCRIPT]

    # Unless listed as an exception, the tag is the script code with its
    # first letter lowercased.
    script_tags = [
        OTTags.SCRIPT_EXCEPTIONS.get(
            script_code,
            script_code[0].lower() + script_code[1:]
        )
    ]
    if script_code in OTTags.NEW_SCRIPT_TAGS:
        script_tags.extend(OTTags.NEW_SCRIPT_TAGS[script_code])
        script_tags.reverse()  # last in, first out

    return script_tags


def ot_tag_to_script(tag):
    """Return the Unicode script code for the given OpenType script tag, or
    None for "DFLT" tag or if there is no Unicode script associated with it.
    Raises ValueError if the tag is invalid.
    """
    tag = tostr(tag).strip()
    if not tag or " " in tag or len(tag) > 4:
        raise ValueError("invalid OpenType tag: %r" % tag)

    if tag in OTTags.SCRIPT_ALIASES:
        tag = OTTags.SCRIPT_ALIASES[tag]

    # pad with spaces up to the four characters of an OpenType tag
    tag = tag.ljust(4)

    if tag == OTTags.DEFAULT_SCRIPT:
        # it's unclear which Unicode script the "DFLT" OpenType tag maps to,
        # so here we return None
        return None

    if tag in OTTags.NEW_SCRIPT_TAGS_REVERSED:
        return OTTags.NEW_SCRIPT_TAGS_REVERSED[tag]

    # This side of the conversion is fully algorithmic

    # Any spaces at the end of the tag are replaced by repeating the last
    # letter. Eg 'nko ' -> 'Nkoo'.
    # Change first char to uppercase
    candidate = tag[0].upper() + tag[1]
    for i in range(2, 4):
        candidate += candidate[i - 1] if tag[i] == " " else tag[i]

    if candidate not in Scripts.NAMES:
        return None
    return candidate