import re
from unicodedata import ucd_3_2_0 as unicodedata


def gen_category(cats):
    """Yield every code point whose general category is in *cats*.

    All code points in range(0, 0x110000) are scanned, in ascending order,
    using the Unicode 3.2.0 database imported above.
    """
    for i in range(0, 0x110000):
        if unicodedata.category(chr(i)) in cats:
            yield i


def gen_bidirectional(cats):
    """Yield every code point whose bidirectional class is in *cats*.

    All code points in range(0, 0x110000) are scanned, in ascending order,
    using the Unicode 3.2.0 database imported above.
    """
    for i in range(0, 0x110000):
        if unicodedata.bidirectional(chr(i)) in cats:
            yield i


def compact_set(l):
    """Return Python source text for a set holding the integers in *l*.

    Runs of consecutive integers are written as ``range(a,b)`` expressions,
    everything else as a list literal, e.g.::

        compact_set([1, 2, 3, 4, 5, 10]) == "set([10] + list(range(1,6)))"

    Runs are detected by comparing each element with the previous ones, so
    *l* is expected to be in ascending order without duplicates (the callers
    in this file pass sorted key lists).  An empty *l* yields ``"set()"``.
    """
    singles = []
    ranges = []
    prev = None   # first element of the run currently being collected
    span = 0      # number of elements following prev in the current run
    for e in l:
        if prev is None:
            prev = e
            span = 0
            continue
        if prev + span + 1 != e:
            # The run ended.  Inside the sequence only runs of four or more
            # elements (span > 2) are written as a range.
            if span > 2:
                ranges.append((prev, prev + span + 1))
            else:
                singles.extend(range(prev, prev + span + 1))
            prev = e
            span = 0
        else:
            span += 1
    if prev is None:
        # Empty input: there is no pending run to flush.
        return "set()"
    # The trailing run becomes a range as soon as it has two elements.  This
    # differs from the threshold used inside the loop; it is kept as is so
    # that the generated text does not change.
    if span:
        ranges.append((prev, prev + span + 1))
    else:
        singles.append(prev)
    if not singles and len(ranges) == 1:
        # A lone range can be handed to set() without list concatenation.
        ranges_expr = "range(%d,%d)" % ranges[0]
    else:
        ranges_expr = " + ".join("list(range(%d,%d))" % t for t in ranges)
    if not singles:
        return "set(%s)" % ranges_expr
    if not ranges_expr:
        return "set(%r)" % (singles,)
    return "set(%r + %s)" % (singles, ranges_expr)

############## Read the tables in the RFC #######################

with open("rfc3454.txt") as f:
    data = f.readlines()

# tables is a list of (name, table) pairs in the order the tables appear in
# the file.  Each table is a dict keyed by code point: set-like tables map a
# code point to itself, mapping tables map it to a list of code points, or to
# None when the value field is empty.
tables = []
curname = None
for l in data:
    l = l.strip()
    if not l:
        continue
    # Skip RFC page breaks
    if l.startswith(("Hoffman & Blanchet", "RFC 3454")):
        continue
    # Find start/end lines.  The dot between the components of a table name
    # is matched literally.
    m = re.match(r"----- (Start|End) Table ([A-Z](\.[0-9])+) -----", l)
    if m:
        if m.group(1) == "Start":
            if curname:
                raise RuntimeError("Double Start", (curname, l))
            curname = m.group(2)
            table = {}
            tables.append((curname, table))
        else:
            if not curname:
                raise RuntimeError("End without start", l)
            if curname != m.group(2):
                raise RuntimeError("Unexpected end", l)
            curname = None
        continue
    if not curname:
        # Text outside of any table is ignored.
        continue
    # Now we are in a table
    fields = l.split(";")
    if len(fields) > 1:
        # Drop comment field
        fields = fields[:-1]
    if len(fields) == 1:
        # A single code point or a "start-end" range; every code point in it
        # is mapped to itself.
        fields = fields[0].split("-")
        if len(fields) > 1:
            # range
            try:
                start, end = fields
            except ValueError as err:
                raise RuntimeError("Unpacking problem", l) from err
        else:
            start = end = fields[0]
        start = int(start, 16)
        end = int(end, 16)
        for i in range(start, end+1):
            table[i] = i
    else:
        # A mapping line: code point, then space-separated replacement
        # code points.
        code, value = fields
        value = value.strip()
        if value:
            value = [int(v, 16) for v in value.split(" ")]
        else:
            # table B.1
            value = None
        table[int(code, 16)] = value

########### Generate compact Python versions of the tables #############

print("""# This file is generated by mkstringprep.py. DO NOT EDIT.
\"\"\"Library that exposes various tables found in the StringPrep RFC 3454.

There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided.
\"\"\"

from unicodedata import ucd_3_2_0 as unicodedata
""")

print("assert unicodedata.unidata_version == %r" % (unicodedata.unidata_version,))

# A.1 is the table of unassigned characters
# XXX Plane 15 PUA is listed as unassigned in Python.
name, table = tables.pop(0)
assert name == "A.1"
table = set(table)
Cn = set(gen_category(["Cn"]))

# FDD0..FDEF are process internal codes
Cn -= set(range(0xFDD0, 0xFDF0))
# not a character
Cn -= set(range(0xFFFE, 0x110000, 0x10000))
Cn -= set(range(0xFFFF, 0x110000, 0x10000))

# The comparison of the RFC table with Cn is disabled (presumably because of
# the discrepancy mentioned in the XXX note above -- confirm before enabling).
# assert table == Cn

print("""
def in_table_a1(code):
    if unicodedata.category(code) != 'Cn': return False
    c = ord(code)
    if 0xFDD0 <= c < 0xFDF0: return False
    return (c & 0xFFFF) not in (0xFFFE, 0xFFFF)
""")

# B.1 cannot easily be derived
name, table = tables.pop(0)
assert name == "B.1"
table = sorted(table)
print("""
b1_set = """ + compact_set(table) + """
def in_table_b1(code):
    return ord(code) in b1_set
""")

# B.2 and B.3 is case folding.
# It takes CaseFolding.txt into account, which is
# not available in the Python database. Since
# B.2 is derived from B.3, we process B.3 first.
# B.3 supposedly *is* CaseFolding-3.2.0.txt.

# The next two parsed tables must be B.2 and B.3, in that order.
name, table_b2 = tables.pop(0)
assert name == "B.2"

name, table_b3 = tables.pop(0)
assert name == "B.3"

# B.3 is mostly Python's .lower, except for a number
# of special cases, e.g. considering canonical forms.
#
# NOTE(review): the exceptions are collected from table_b2 although the
# comment above talks about B.3, and table_b3 is not used by the code that
# follows -- confirm this is intended before changing it.

# Keep only the entries whose mapping differs from str.lower(); the value is
# stored as the replacement string.
b3_exceptions = {
    k: "".join(map(chr, v))
    for k, v in table_b2.items()
    if list(map(ord, chr(k).lower())) != v
}

b3 = sorted(b3_exceptions.items())

# Emit the exception dict literal, four entries per output line.
print("""
b3_exceptions = {""")
for position, (codepoint, replacement) in enumerate(b3):
    print("0x%x:%a," % (codepoint, replacement), end=' ')
    if position % 4 == 3:
        print()
print("}")

print("""
def map_table_b3(code):
    r = b3_exceptions.get(ord(code))
    if r is not None: return r
    return code.lower()
""")

def map_table_b3(code):
    """Map the single character *code* through b3_exceptions.

    Returns the exception string when one is recorded for the character,
    otherwise code.lower().  This is the same function that is emitted as
    text above; it is defined here as well so the generator can call it.
    """
    replacement = b3_exceptions.get(ord(code))
    if replacement is None:
        return code.lower()
    return replacement

# B.2 is case folding for NFKC.
# This is the same as B.3,
# except where NormalizeWithKC(Fold(a)) !=
# NormalizeWithKC(Fold(NormalizeWithKC(Fold(a))))

def map_table_b2(a):
    """Map the single character *a* as described in the comment above.

    The character is folded with map_table_b3 and NFKC-normalized; the result
    is folded and normalized a second time.  If the second pass changes the
    string, the second result is returned, otherwise the plain map_table_b3
    folding of *a*.
    """
    folded = map_table_b3(a)
    normalized = unicodedata.normalize("NFKC", folded)
    refolded = "".join(map(map_table_b3, normalized))
    renormalized = unicodedata.normalize("NFKC", refolded)
    return renormalized if normalized != renormalized else folded

# Collect every B.2 entry that map_table_b2 does not reproduce.
specials = {
    k: v
    for k, v in table_b2.items()
    if list(map(ord, map_table_b2(chr(k)))) != v
}

# B.3 should not add any additional special cases
assert not specials

print("""
def map_table_b2(a):
    al = map_table_b3(a)
    b = unicodedata.normalize("NFKC", al)
    bl = "".join([map_table_b3(ch) for ch in b])
    c = unicodedata.normalize("NFKC", bl)
    if b != c:
        return c
    else:
        return al
""")

# C.1.1 is a table with a single character
name, table = tables.pop(0)
assert name == "C.1.1"
assert table == {0x20: 0x20}

print("""
def in_table_c11(code):
    return code == " "
""")

# C.1.2 is the rest of all space characters
name, table = tables.pop(0)
assert name == "C.1.2"

# Disabled cross-check of the table against category Zs:
# table = set(table.keys())
# Zs = set(gen_category(["Zs"])) - {0x20}
# assert Zs == table

print("""
def in_table_c12(code):
    return unicodedata.category(code) == "Zs" and code != " "

def in_table_c11_c12(code):
    return unicodedata.category(code) == "Zs"
""")

# C.2.1 ASCII control characters
name, table_c21 = tables.pop(0)
assert name == "C.2.1"

# The table must be exactly the category Cc code points below 128.
Cc = set(gen_category(["Cc"]))
Cc_ascii = Cc.intersection(range(128))
table_c21 = set(table_c21)
assert table_c21 == Cc_ascii

print("""
def in_table_c21(code):
    return ord(code) < 128 and unicodedata.category(code) == "Cc"
""")

# C.2.2 Non-ASCII control characters. It also includes
# a number of characters in category Cf.
name, table_c22 = tables.pop(0)
assert name == "C.2.2"

# Every non-ASCII Cc code point must be listed in the table; whatever the
# table lists beyond those is emitted as an explicit set.
Cc_nonascii = Cc - Cc_ascii
table_c22 = set(table_c22)
assert not (Cc_nonascii - table_c22)

specials = sorted(table_c22 - Cc_nonascii)

print("""c22_specials = """ + compact_set(specials) + """
def in_table_c22(code):
    c = ord(code)
    if c < 128: return False
    if unicodedata.category(code) == "Cc": return True
    return c in c22_specials

def in_table_c21_c22(code):
    return unicodedata.category(code) == "Cc" or \\
           ord(code) in c22_specials
""")

# C.3 Private use
name, table = tables.pop(0)
assert name == "C.3"

# The table must be exactly the code points of category Co.
Co = set(gen_category(["Co"]))
assert Co == set(table)

print("""
def in_table_c3(code):
    return unicodedata.category(code) == "Co"
""")

# C.4 Non-character code points, xFFFE, xFFFF
# plus process internal codes
name, table = tables.pop(0)
assert name == "C.4"

# FDD0..FDEF plus the code points ending in FFFE or FFFF, stepping by
# 0x10000 up to 0x110000.
nonchar = (
    set(range(0xFDD0, 0xFDF0))
    | set(range(0xFFFE, 0x110000, 0x10000))
    | set(range(0xFFFF, 0x110000, 0x10000))
)
table = set(table)
assert table == nonchar

print("""
def in_table_c4(code):
    c = ord(code)
    if c < 0xFDD0: return False
    if c < 0xFDF0: return True
    return (ord(code) & 0xFFFF) in (0xFFFE, 0xFFFF)
""")

# C.5 Surrogate codes
name, table = tables.pop(0)
assert name == "C.5"

# The table must be exactly the code points of category Cs.
Cs = set(gen_category(["Cs"]))
assert Cs == set(table)

print("""
def in_table_c5(code):
    return unicodedata.category(code) == "Cs"
""")

# C.6 Inappropriate for plain text
name, table = tables.pop(0)
assert name == "C.6"

# Emitted as an explicit set; it is not checked against a Unicode property.
table = sorted(table)

print("""
c6_set = """ + compact_set(table) + """
def in_table_c6(code):
    return ord(code) in c6_set
""")

# C.7 Inappropriate for canonical representation
name, table = tables.pop(0)
assert name == "C.7"

# C.7, C.8 and C.9 are each emitted as an explicit set of code points.
table = sorted(table)

print("""
c7_set = """ + compact_set(table) + """
def in_table_c7(code):
    return ord(code) in c7_set
""")

# C.8 Change display properties or are deprecated
name, table = tables.pop(0)
assert name == "C.8"

table = sorted(table)

print("""
c8_set = """ + compact_set(table) + """
def in_table_c8(code):
    return ord(code) in c8_set
""")

# C.9 Tagging characters
name, table = tables.pop(0)
assert name == "C.9"

table = sorted(table)

print("""
c9_set = """ + compact_set(table) + """
def in_table_c9(code):
    return ord(code) in c9_set
""")

# D.1 Characters with bidirectional property "R" or "AL"
name, table = tables.pop(0)
assert name == "D.1"

# The table must be exactly the code points with these bidirectional classes.
RandAL = set(gen_bidirectional(["R","AL"]))
assert RandAL == set(table)

print("""
def in_table_d1(code):
    return unicodedata.bidirectional(code) in ("R","AL")
""")

# D.2 Characters with bidirectional property "L"
name, table = tables.pop(0)
assert name == "D.2"

# The table must be exactly the code points with bidirectional class "L".
L = set(gen_bidirectional(["L"]))
assert L == set(table)

print("""
def in_table_d2(code):
    return unicodedata.bidirectional(code) == "L"
""")