# Copyright 2008 The RE2 Authors.  All Rights Reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

"""Parser for Unicode data files (as distributed by unicode.org)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import re

try:
    from six.moves import urllib
except ImportError:
    # six is only used as a Python 2/3 shim for urllib; on Python 3 the
    # standard library exposes the same urllib.request API directly.
    import urllib.request

# Directory or URL where Unicode tables reside.
_UNICODE_DIR = "https://www.unicode.org/Public/12.1.0/ucd"

# Largest valid Unicode code value.
_RUNE_MAX = 0x10FFFF


class Error(Exception):
    """Unicode error base class."""


class InputError(Error):
    """Unicode input error class.  Raised on invalid input."""


def _UInt(s):
    """Converts string to Unicode code point ('263A' => 0x263a).

    Args:
      s: string to convert (4 to 6 hexadecimal digits)

    Returns:
      Unicode code point

    Raises:
      InputError: the string is not a valid Unicode value.
    """
    try:
        v = int(s, 16)
    except ValueError:
        # Use an out-of-range marker so the check below reports the error.
        v = -1
    if len(s) < 4 or len(s) > 6 or v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (s,))
    return v


def _URange(s):
    """Converts string to Unicode range.

      '0001..0003' => [1, 2, 3].
      '0001' => [1].

    Args:
      s: string to convert

    Returns:
      Unicode range, always as a list of code points

    Raises:
      InputError: the string is not a valid Unicode range.
    """
    a = s.split("..")
    if len(a) == 1:
        return [_UInt(a[0])]
    if len(a) == 2:
        lo = _UInt(a[0])
        hi = _UInt(a[1])
        if lo < hi:
            # Materialize the range so both branches return the same type
            # (a bare range() is a lazy object on Python 3, not a list).
            return list(range(lo, hi + 1))
    raise InputError("invalid Unicode range %s" % (s,))


def _UStr(v):
    """Converts Unicode code point to hex string.

      0x263a => '0x263A'.

    Args:
      v: code point to convert

    Returns:
      Unicode string

    Raises:
      InputError: the argument is not a valid Unicode value.
    """
    if v < 0 or v > _RUNE_MAX:
        raise InputError("invalid Unicode value %s" % (v,))
    return "0x%04X" % (v,)


def _ParseContinue(s):
    """Parses a Unicode continuation field.

    These are of the form '<Name, First>' or '<Name, Last>'.
    Instead of giving an explicit range in a single table entry,
    some Unicode tables use two entries, one for the first
    code value in the range and one for the last.
    The first entry's description is '<Name, First>' instead of 'Name'
    and the second is '<Name, Last>'.

      '<Name, First>' => ('Name', 'First')
      '<Name, Last>' => ('Name', 'Last')
      'Anything else' => ('Anything else', None)

    Args:
      s: continuation field string

    Returns:
      pair: name and ('First', 'Last', or None)
    """
    match = re.match(r"<(.*), (First|Last)>", s)
    if match is not None:
        return match.groups()
    return (s, None)


def ReadUnicodeTable(filename, nfields, doline):
    """Generic Unicode table text file reader.

    The reader takes care of stripping out comments and also
    parsing the two different ways that the Unicode tables specify
    code ranges (using the .. notation and splitting the range across
    multiple lines).

    Each non-comment line in the table is expected to have the given
    number of fields.  The first field is known to be the Unicode value
    and the second field its description.

    The reader calls doline(codes, fields) for each entry in the table,
    where codes is a list of code points and fields is the list of
    stripped field strings (for a First/Last pair, fields[0] is rewritten
    to 'XXXX..YYYY' and fields[1] to the bare name).
    If a line is malformed or doline raises an exception, the reader
    prints that exception, prefixed with the file name and line number,
    and re-raises it immediately; the rest of the file is not processed.

    Arguments:
      filename: the Unicode data file to read (a path or an http(s) URL),
        or a file-like object yielding lines.  A file or URL opened here
        is closed before returning; a caller-supplied object is left open.
      nfields: the number of expected fields per line in that file.
      doline: the function to call for each table entry.

    Raises:
      InputError: nfields is invalid (must be >= 2), a line is malformed,
        or the file ends in the middle of a First/Last range.
    """
    if nfields < 2:
        raise InputError("invalid number of fields %d" % (nfields,))

    # Only handles we open ourselves are ours to close.
    opened_here = isinstance(filename, str)
    if opened_here:
        if filename.startswith(("http://", "https://")):
            fil = urllib.request.urlopen(filename)
        else:
            fil = open(filename, "rb")
    else:
        fil = filename

    try:
        first = None        # first code in multiline range
        expect_last = None  # tag expected for "Last" line in multiline range
        lineno = 0          # current line number
        for line in fil:
            lineno += 1
            try:
                # Binary sources yield bytes; text-mode file objects
                # already yield decoded strings.
                if isinstance(line, bytes):
                    line = line.decode("latin1")

                # Chop # comments and white space; ignore empty lines.
                sharp = line.find("#")
                if sharp >= 0:
                    line = line[:sharp]
                line = line.strip()
                if not line:
                    continue

                # Split fields on ";", chop more white space.
                # Must have the expected number of fields.
                fields = [s.strip() for s in line.split(";")]
                if len(fields) != nfields:
                    raise InputError("wrong number of fields %d %d - %s" %
                                     (len(fields), nfields, line))

                # The Unicode text files have two different ways
                # to list a Unicode range.  Either the first field is
                # itself a range (0000..FFFF), or the range is split
                # across two lines, with the second field noting
                # the continuation.
                codes = _URange(fields[0])
                (name, cont) = _ParseContinue(fields[1])

                if expect_last is not None:
                    # If the last line gave the First code in a range,
                    # this one had better give the Last one.
                    if (len(codes) != 1 or codes[0] <= first or
                            cont != "Last" or name != expect_last):
                        raise InputError("expected Last line for %s" %
                                         (expect_last,))
                    codes = list(range(first, codes[0] + 1))
                    first = None
                    expect_last = None
                    fields[0] = "%04X..%04X" % (codes[0], codes[-1])
                    fields[1] = name
                elif cont == "First":
                    # Otherwise, if this is the First code in a range,
                    # remember it and go to the next line.
                    if len(codes) != 1:
                        raise InputError("bad First line: range given")
                    expect_last = name
                    first = codes[0]
                    continue

                doline(codes, fields)

            except Exception as e:
                # Report where the failure happened, then propagate it.
                print("%s:%d: %s" % (filename, lineno, e))
                raise

        if expect_last is not None:
            raise InputError("expected Last line for %s; got EOF" %
                             (expect_last,))
    finally:
        if opened_here:
            fil.close()


def CaseGroups(unicode_dir=_UNICODE_DIR):
    """Returns Unicode code groups equivalent under case folding.

    Only the common ("C") and simple ("S") foldings in CaseFolding.txt
    are used.  Each group is a sorted list of code points,
    and the list of groups is sorted by first code point
    in the group.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      pair (togroup, groups): togroup is a dict mapping each folded
      code point to its group, and groups is the sorted list of those
      same group lists.
    """

    # Dict mapping lowercase code point to fold-equivalent group.
    togroup = {}

    def DoLine(codes, fields):
        """Process single CaseFolding.txt line, updating togroup."""
        (_, foldtype, lower, _) = fields
        if foldtype not in ("C", "S"):
            return
        lower = _UInt(lower)
        togroup.setdefault(lower, [lower]).extend(codes)

    ReadUnicodeTable(unicode_dir + "/CaseFolding.txt", 4, DoLine)

    groups = list(togroup.values())
    for g in groups:
        g.sort()
    groups.sort()
    return togroup, groups


def Scripts(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping script names to code lists.

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping script names to code lists
    """

    scripts = {}

    def DoLine(codes, fields):
        """Process single Scripts.txt line, updating scripts."""
        (_, name) = fields
        scripts.setdefault(name, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/Scripts.txt", 2, DoLine)
    return scripts


def Categories(unicode_dir=_UNICODE_DIR):
    """Returns dict mapping category names to code lists.

    Every code point is recorded under the category given in
    UnicodeData.txt (e.g. 'Lu') and, when that name is longer than one
    character, also under its first letter (e.g. 'L').

    Args:
      unicode_dir: Unicode data directory

    Returns:
      dict mapping category names to code lists
    """

    categories = {}

    def DoLine(codes, fields):
        """Process single UnicodeData.txt line, updating categories."""
        category = fields[2]
        categories.setdefault(category, []).extend(codes)
        # Add codes from Lu into L, etc.
        if len(category) > 1:
            short = category[0]
            categories.setdefault(short, []).extend(codes)

    ReadUnicodeTable(unicode_dir + "/UnicodeData.txt", 15, DoLine)
    return categories