1#! /usr/bin/python 2 3# Multistage table builder 4# (c) Peter Kankowski, 2008 5 6############################################################################## 7# This script was submitted to the PCRE project by Peter Kankowski as part of 8# the upgrading of Unicode property support. The new code speeds up property 9# matching many times. The script is for the use of PCRE maintainers, to 10# generate the pcre2_ucd.c file that contains a digested form of the Unicode 11# data tables. A number of extensions have been added to the original script. 12# 13# The script has now been upgraded to Python 3 for PCRE2, and should be run in 14# the maint subdirectory, using the command 15# 16# [python3] ./MultiStage2.py >../src/pcre2_ucd.c 17# 18# It requires six Unicode data tables: DerivedGeneralCategory.txt, 19# GraphemeBreakProperty.txt, Scripts.txt, ScriptExtensions.txt, 20# CaseFolding.txt, and emoji-data.txt. These must be in the 21# maint/Unicode.tables subdirectory. 22# 23# DerivedGeneralCategory.txt is found in the "extracted" subdirectory of the 24# Unicode database (UCD) on the Unicode web site; GraphemeBreakProperty.txt is 25# in the "auxiliary" subdirectory. Scripts.txt, ScriptExtensions.txt, and 26# CaseFolding.txt are directly in the UCD directory. 27# 28# The emoji-data.txt file is found in the "emoji" subdirectory even though it 29# is technically part of a different (but coordinated) standard as shown 30# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"), 31# for example: 32# 33# http://unicode.org/Public/emoji/13.0/ReadMe.txt 34# 35# ----------------------------------------------------------------------------- 36# Minor modifications made to this script: 37# Added #! line at start 38# Removed tabs 39# Made it work with Python 2.4 by rewriting two statements that needed 2.5 40# Consequent code tidy 41# Adjusted data file names to take from the Unicode.tables directory 42# Adjusted global table names by prefixing _pcre_. 
43# Commented out stuff relating to the casefolding table, which isn't used; 44# removed completely in 2012. 45# Corrected size calculation 46# Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed. 47# Update for PCRE2: name changes, and SUPPORT_UCP is abolished. 48# 49# Major modifications made to this script: 50# Added code to add a grapheme break property field to records. 51# 52# Added code to search for sets of more than two characters that must match 53# each other caselessly. A new table is output containing these sets, and 54# offsets into the table are added to the main output records. This new 55# code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer 56# used. 57# 58# Update for Python3: 59# . Processed with 2to3, but that didn't fix everything 60# . Changed string.strip to str.strip 61# . Added encoding='utf-8' to the open() call 62# . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is 63# required and the result of the division is a float 64# 65# Added code to scan the emoji-data.txt file to find the Extended Pictographic 66# property, which is used by PCRE2 as a grapheme breaking property. This was 67# done when updating to Unicode 11.0.0 (July 2018). 68# 69# Added code to add a Script Extensions field to records. This has increased 70# their size from 8 to 12 bytes, only 10 of which are currently used. 71# 72# 01-March-2010: Updated list of scripts for Unicode 5.2.0 73# 30-April-2011: Updated list of scripts for Unicode 6.0.0 74# July-2012: Updated list of scripts for Unicode 6.1.0 75# 20-August-2012: Added scan of GraphemeBreakProperty.txt and added a new 76# field in the record to hold the value. Luckily, the 77# structure had a hole in it, so the resulting table is 78# not much bigger than before. 79# 18-September-2012: Added code for multiple caseless sets. This uses the 80# final hole in the structure. 
81# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0 82# 13-May-2014: Updated for PCRE2 83# 03-June-2014: Updated for Python 3 84# 20-June-2014: Updated for Unicode 7.0.0 85# 12-August-2014: Updated to put Unicode version into the file 86# 19-June-2015: Updated for Unicode 8.0.0 87# 02-July-2017: Updated for Unicode 10.0.0 88# 03-July-2018: Updated for Unicode 11.0.0 89# 07-July-2018: Added code to scan emoji-data.txt for the Extended 90# Pictographic property. 91# 01-October-2018: Added the 'Unknown' script name 92# 03-October-2018: Added new field for Script Extensions 93# 27-July-2019: Updated for Unicode 12.1.0 94# 10-March-2020: Updated for Unicode 13.0.0 95# PCRE2-10.39: Updated for Unicode 14.0.0 96# ---------------------------------------------------------------------------- 97# 98# 99# The main tables generated by this script are used by macros defined in 100# pcre2_internal.h. They look up Unicode character properties using short 101# sequences of code that contains no branches, which makes for greater speed. 102# 103# Conceptually, there is a table of records (of type ucd_record), containing a 104# script number, script extension value, character type, grapheme break type, 105# offset to caseless matching set, offset to the character's other case, for 106# every Unicode character. However, a real table covering all Unicode 107# characters would be far too big. It can be efficiently compressed by 108# observing that many characters have the same record, and many blocks of 109# characters (taking 128 characters in a block) have the same set of records as 110# other blocks. This leads to a 2-stage lookup process. 111# 112# This script constructs six tables. The ucd_caseless_sets table contains 113# lists of characters that all match each other caselessly. Each list is 114# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than 115# any valid character. 
# The first list is empty; this is used for characters
# that are not part of any list.
#
# The ucd_digit_sets table contains the code points of the '9' characters in
# each set of 10 decimal digits in Unicode. This is used to ensure that digits
# in script runs all come from the same set. The first element in the vector
# contains the number of subsequent elements, which are in ascending order.
#
# The ucd_script_sets vector contains lists of script numbers that are the
# Script Extensions properties of certain characters. Each list is terminated
# by zero (ucp_Unknown). A character with more than one script listed for its
# Script Extension property has a negative value in its record. This is the
# negated offset to the start of the relevant list in the ucd_script_sets
# vector.
#
# The ucd_records table contains one instance of every unique record that is
# required. The ucd_stage1 table is indexed by a character's block number,
# which is the character's code point divided by 128, since 128 is the size
# of each block. The result of a lookup in ucd_stage1 is a "virtual" block
# number.
#
# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
# the offset of a character within its own block, and the result is the index
# number of the required record in the ucd_records vector.
#
# The following examples are correct for the Unicode 11.0.0 database. Future
# updates may change the actual lookup values.
141# 142# Example: lowercase "a" (U+0061) is in block 0 143# lookup 0 in stage1 table yields 0 144# lookup 97 (0x61) in the first table in stage2 yields 17 145# record 17 is { 34, 5, 12, 0, -32, 34, 0 } 146# 34 = ucp_Latin => Latin script 147# 5 = ucp_Ll => Lower case letter 148# 12 = ucp_gbOther => Grapheme break property "Other" 149# 0 => Not part of a caseless set 150# -32 (-0x20) => Other case is U+0041 151# 34 = ucp_Latin => No special Script Extension property 152# 0 => Dummy value, unused at present 153# 154# Almost all lowercase latin characters resolve to the same record. One or two 155# are different because they are part of a multi-character caseless set (for 156# example, k, K and the Kelvin symbol are such a set). 157# 158# Example: hiragana letter A (U+3042) is in block 96 (0x60) 159# lookup 96 in stage1 table yields 90 160# lookup 66 (0x42) in table 90 in stage2 yields 564 161# record 564 is { 27, 7, 12, 0, 0, 27, 0 } 162# 27 = ucp_Hiragana => Hiragana script 163# 7 = ucp_Lo => Other letter 164# 12 = ucp_gbOther => Grapheme break property "Other" 165# 0 => Not part of a caseless set 166# 0 => No other case 167# 27 = ucp_Hiragana => No special Script Extension property 168# 0 => Dummy value, unused at present 169# 170# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39) 171# lookup 57 in stage1 table yields 55 172# lookup 80 (0x50) in table 55 in stage2 yields 458 173# record 458 is { 28, 12, 3, 0, 0, -101, 0 } 174# 28 = ucp_Inherited => Script inherited from predecessor 175# 12 = ucp_Mn => Non-spacing mark 176# 3 = ucp_gbExtend => Grapheme break property "Extend" 177# 0 => Not part of a caseless set 178# 0 => No other case 179# -101 => Script Extension list offset = 101 180# 0 => Dummy value, unused at present 181# 182# At offset 101 in the ucd_script_sets vector we find the list 3, 15, 107, 29, 183# and terminator 0. 
# This means that this character is expected to be used with
# any of those scripts, which are Bengali, Devanagari, Grantha, and Kannada.
#
# Philip Hazel, 03 July 2008
##############################################################################


import re
import string
import sys

MAX_UNICODE = 0x110000
NOTACHAR = 0xffffffff


# Parse a line of Scripts.txt, GraphemeBreakProperty.txt or DerivedGeneralCategory.txt
def make_get_names(enum):
  """Return a parser that maps the property name in field 1 of a split data
  line to its index in the given enumeration list."""
  return lambda chardata: enum.index(chardata[1])

# Parse a line of CaseFolding.txt
def get_other_case(chardata):
  """Return the signed code point offset to the case-folded partner for
  Common (C) and Simple (S) folding lines; all other statuses yield 0."""
  if chardata[1] == 'C' or chardata[1] == 'S':
    return int(chardata[2], 16) - int(chardata[0], 16)
  return 0

# Parse a line of ScriptExtensions.txt
def get_script_extension(chardata):
  """Return the Script Extensions value for one data line.

  A single listed script yields its (positive) script number. Multiple
  scripts yield the negated offset of a zero-terminated sublist within the
  global script_lists vector; an identical existing sublist is reused.
  """
  this_script_list = list(chardata[1].split(' '))
  if len(this_script_list) == 1:
    return script_abbrevs.index(this_script_list[0])

  script_numbers = []
  for d in this_script_list:
    script_numbers.append(script_abbrevs.index(d))
  script_numbers.append(0)
  script_numbers_length = len(script_numbers)

  # Search for an identical sublist already present in script_lists.
  # ('found' is initialized before the inner loop so it is always bound.)
  for i in range(1, len(script_lists) - script_numbers_length + 1):
    found = True
    for j in range(0, script_numbers_length):
      if script_lists[i+j] != script_numbers[j]:
        found = False
        break
    if found:
      return -i

  # Not found in existing lists: append a new sublist.

  return_value = len(script_lists)
  script_lists.extend(script_numbers)
  return -return_value

# Read the whole table in memory, setting/checking the Unicode version
def read_table(file_name, get_value, default_value):
  """Read a Unicode data file into a list indexed by code point.

  file_name     path of the data file; its first line must carry the
                Unicode version, which is checked against the other files
  get_value     callable mapping the semicolon-split fields of a line to
                the value to store
  default_value value stored for code points the file does not mention

  Returns a list of MAX_UNICODE values. A slot is only written while it
  still holds default_value: in the CaseFolding file there are lines to be
  ignored (returning the default value of 0) which often come after a line
  which has already set data.
  """
  global unicode_version

  f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
  file_base = f.group(1)
  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
  file = open(file_name, 'r', encoding='utf-8')
  f = re.match(version_pat, file.readline())
  version = f.group(1)
  if unicode_version == "":
    unicode_version = version
  elif unicode_version != version:
    # Bug fix: the file name must be %-interpolated into the message; the
    # original passed it as an extra positional argument to print().
    print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

  table = [default_value] * MAX_UNICODE
  for line in file:
    line = re.sub(r'#.*', '', line)
    chardata = list(map(str.strip, line.split(';')))
    if len(chardata) <= 1:
      continue
    value = get_value(chardata)
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)
    for i in range(char, last + 1):
      # It is important not to overwrite a previously set
      # value because in the CaseFolding file there are lines
      # to be ignored (returning the default value of 0)
      # which often come after a line which has already set
      # data.
      if table[i] == default_value:
        table[i] = value
  file.close()
  return table

# Get the smallest possible C language type for the values
def get_type_size(table):
  """Return (c_type_name, size_in_bytes) for the smallest C type that can
  hold every value in table; raise OverflowError if none fits."""
  type_size = [("uint8_t", 1), ("uint16_t", 2), ("uint32_t", 4),
    ("signed char", 1), ("pcre_int16", 2), ("pcre_int32", 4)]
  limits = [(0, 255), (0, 65535), (0, 4294967295),
    (-128, 127), (-32768, 32767), (-2147483648, 2147483647)]
  minval = min(table)
  maxval = max(table)
  for num, (minlimit, maxlimit) in enumerate(limits):
    if minlimit <= minval and maxval <= maxlimit:
      return type_size[num]
  else:
    raise OverflowError("Too large to fit into C types")

def get_tables_size(*tables):
  """Return the total byte size of the given tables, each assumed to be
  stored with its smallest possible C type."""
  total_size = 0
  for table in tables:
    _, size = get_type_size(table)
    total_size += size * len(table)
  return total_size

# Compress the table into the two stages
def compress_table(table, block_size):
  """Split table into blocks of block_size, de-duplicating identical blocks.

  Returns (stage1, stage2): stage1 maps each block number to the index of
  its (possibly shared) block within stage2, and stage2 is the
  concatenation of the unique blocks.
  """
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block. Bug fix: use integer division; the original
      # "/" put Python 3 floats into stage1.
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)

  return stage1, stage2

# Print a table
def print_table(table, table_name, block_size = None):
  """Write a C array definition for table to stdout.

  With block_size, the table is printed block by block (stage 2 style);
  otherwise each output line is annotated with its starting code point.
  """
  type, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  print(s + " */")
  table = tuple(table)
  if block_size is None:
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */"
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      print(fmt % (table[i:i+ELEMS_PER_LINE] +
        (int(i * mult),)))
  else:
    if block_size > ELEMS_PER_LINE:
      el = ELEMS_PER_LINE
    else:
      el = block_size
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * int(block_size / ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      # Integer division keeps the block number an int (cosmetic only:
      # %d also accepts a whole-number float).
      print(("/* block %d */\n" + fmt) % ((i // block_size,) + table[i:i+block_size]))
  print("};\n")

# Extract the unique combinations of properties into records
def combine_tables(*tables):
  """Zip the per-code-point tables together and de-duplicate the rows.

  Returns (index, records): index maps each code point to a record number,
  and records maps each unique property tuple to its record number.
  """
  records = {}
  index = []
  for t in zip(*tables):
    i = records.get(t)
    if i is None:
      i = records[t] = len(records)
    index.append(i)
  return index, records

def get_record_size_struct(records):
  """Compute the aligned byte size of a ucd_record holding the given
  property tuples.

  Returns (size, structure): size includes alignment padding between
  fields and at the end of the record (so records pack in an array), and
  structure is a C comment showing an equivalent struct definition.
  """
  size = 0
  structure = '/* When recompiling tables with a new Unicode version, please check the\n' + \
  'types in this structure definition from pcre2_internal.h (the actual\n' + \
  'field names will be different):\n\ntypedef struct {\n'
  for i in range(len(records[0])):
    record_slice = [record[i] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    # add padding: round up to the nearest power of slice_size
    size = (size + slice_size - 1) & -slice_size
    size += slice_size
    structure += '%s property_%d;\n' % (slice_type, i)

  # round up to the first item of the next structure in array
  record_slice = [record[0] for record in records]
  slice_type, slice_size = get_type_size(record_slice)
  size = (size + slice_size - 1) & -slice_size

  structure += '} ucd_record;\n*/\n'
  return size, structure

def test_record_size():
  """Self-test for the alignment arithmetic in get_record_size_struct()."""
  tests = [
    ( [(3,), (6,), (6,), (1,)], 1 ),
    ( [(300,), (600,), (600,), (100,)], 2 ),
    ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ),
    ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ),
    ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ),
    ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ),
    ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ),
    ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ),
  ]
  for test in tests:
    size, struct = get_record_size_struct(test[0])
    assert(size == test[1])

def print_records(records, record_size):
  """Write the C array of unique ucd_records, ordered by record number."""
  print('const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */' % (len(records) * record_size, record_size))

  # Emit the records in record-number order; each unique property tuple
  # carries its number as the dict value.
  for i, record in enumerate(sorted(records.items(), key = lambda x: x[1])):
    print((' {' + '%6d, ' * len(record[0]) + '}, /* %3d */') % (record[0] + (i,)))
  print('};\n')

# Script names in a fixed order: the index of a name is the ucp_xx script
# number used throughout PCRE2, so entries must never be reordered.

script_names = ['Unknown', 'Arabic', 'Armenian', 'Bengali', 'Bopomofo', 'Braille', 'Buginese', 'Buhid', 'Canadian_Aboriginal',
 'Cherokee', 'Common', 'Coptic', 'Cypriot', 'Cyrillic', 'Deseret', 'Devanagari', 'Ethiopic', 'Georgian',
 'Glagolitic', 'Gothic', 'Greek', 'Gujarati', 'Gurmukhi', 'Han', 'Hangul', 'Hanunoo', 'Hebrew', 'Hiragana',
 'Inherited', 'Kannada', 'Katakana', 'Kharoshthi', 'Khmer', 'Lao', 'Latin', 'Limbu', 'Linear_B', 'Malayalam',
 'Mongolian', 'Myanmar', 'New_Tai_Lue', 'Ogham', 'Old_Italic', 'Old_Persian', 'Oriya', 'Osmanya', 'Runic',
 'Shavian', 'Sinhala', 'Syloti_Nagri', 'Syriac', 'Tagalog', 'Tagbanwa', 'Tai_Le', 'Tamil', 'Telugu', 'Thaana',
 'Thai', 'Tibetan', 'Tifinagh', 'Ugaritic', 'Yi',
# New for Unicode 5.0
 'Balinese', 'Cuneiform', 'Nko', 'Phags_Pa', 'Phoenician',
# New for Unicode 5.1
 'Carian', 'Cham', 'Kayah_Li', 'Lepcha', 'Lycian', 'Lydian', 'Ol_Chiki', 'Rejang', 'Saurashtra', 'Sundanese', 'Vai',
# New for Unicode 5.2
 'Avestan', 'Bamum', 'Egyptian_Hieroglyphs', 'Imperial_Aramaic',
 'Inscriptional_Pahlavi', 'Inscriptional_Parthian',
 'Javanese', 'Kaithi', 'Lisu', 'Meetei_Mayek',
 'Old_South_Arabian', 'Old_Turkic', 'Samaritan', 'Tai_Tham', 'Tai_Viet',
# New for Unicode 6.0.0
 'Batak', 'Brahmi', 'Mandaic',
# New for Unicode 6.1.0
 'Chakma', 'Meroitic_Cursive', 'Meroitic_Hieroglyphs', 'Miao', 'Sharada', 'Sora_Sompeng', 'Takri',
# New for Unicode 7.0.0
 'Bassa_Vah', 'Caucasian_Albanian', 'Duployan', 'Elbasan', 'Grantha', 'Khojki', 'Khudawadi',
 'Linear_A', 'Mahajani', 'Manichaean', 'Mende_Kikakui', 'Modi', 'Mro', 'Nabataean',
 'Old_North_Arabian', 'Old_Permic', 'Pahawh_Hmong', 'Palmyrene', 'Psalter_Pahlavi',
 'Pau_Cin_Hau', 'Siddham', 'Tirhuta', 'Warang_Citi',
# New for Unicode 8.0.0
 'Ahom', 'Anatolian_Hieroglyphs', 'Hatran', 'Multani', 'Old_Hungarian',
 'SignWriting',
# New for Unicode 10.0.0
 'Adlam', 'Bhaiksuki', 'Marchen', 'Newa', 'Osage', 'Tangut', 'Masaram_Gondi',
 'Nushu', 'Soyombo', 'Zanabazar_Square',
# New for Unicode 11.0.0
 'Dogra', 'Gunjala_Gondi', 'Hanifi_Rohingya', 'Makasar', 'Medefaidrin',
 'Old_Sogdian', 'Sogdian',
# New for Unicode 12.0.0
 'Elymaic', 'Nandinagari', 'Nyiakeng_Puachue_Hmong', 'Wancho',
# New for Unicode 13.0.0
 'Chorasmian', 'Dives_Akuru', 'Khitan_Small_Script', 'Yezidi',
# New for Unicode 14.0.0
 'Cypro_Minoan', 'Old_Uyghur', 'Tangsa', 'Toto', 'Vithkuqi'
 ]

# Script abbreviations, as used in ScriptExtensions.txt. The order must match
# script_names above, because the index of an abbreviation is the script
# number that goes into the generated tables.

script_abbrevs = [
  'Zzzz', 'Arab', 'Armn', 'Beng', 'Bopo', 'Brai', 'Bugi', 'Buhd', 'Cans',
  'Cher', 'Zyyy', 'Copt', 'Cprt', 'Cyrl', 'Dsrt', 'Deva', 'Ethi', 'Geor',
  'Glag', 'Goth', 'Grek', 'Gujr', 'Guru', 'Hani', 'Hang', 'Hano', 'Hebr',
  'Hira', 'Zinh', 'Knda', 'Kana', 'Khar', 'Khmr', 'Laoo', 'Latn', 'Limb',
  'Linb', 'Mlym', 'Mong', 'Mymr', 'Talu', 'Ogam', 'Ital', 'Xpeo', 'Orya',
  'Osma', 'Runr', 'Shaw', 'Sinh', 'Sylo', 'Syrc', 'Tglg', 'Tagb', 'Tale',
  'Taml', 'Telu', 'Thaa', 'Thai', 'Tibt', 'Tfng', 'Ugar', 'Yiii',
#New for Unicode 5.0
  'Bali', 'Xsux', 'Nkoo', 'Phag', 'Phnx',
#New for Unicode 5.1
  'Cari', 'Cham', 'Kali', 'Lepc', 'Lyci', 'Lydi', 'Olck', 'Rjng', 'Saur',
  'Sund', 'Vaii',
#New for Unicode 5.2
  'Avst', 'Bamu', 'Egyp', 'Armi', 'Phli', 'Prti', 'Java', 'Kthi', 'Lisu',
  'Mtei', 'Sarb', 'Orkh', 'Samr', 'Lana', 'Tavt',
#New for Unicode 6.0.0
  'Batk', 'Brah', 'Mand',
#New for Unicode 6.1.0
  'Cakm', 'Merc', 'Mero', 'Plrd', 'Shrd', 'Sora', 'Takr',
#New for Unicode 7.0.0
  'Bass', 'Aghb', 'Dupl', 'Elba', 'Gran', 'Khoj', 'Sind', 'Lina', 'Mahj',
  'Mani', 'Mend', 'Modi', 'Mroo', 'Nbat', 'Narb', 'Perm', 'Hmng', 'Palm',
  'Phlp', 'Pauc', 'Sidd', 'Tirh', 'Wara',
#New for Unicode 8.0.0
  'Ahom', 'Hluw', 'Hatr', 'Mult', 'Hung', 'Sgnw',
#New for Unicode 10.0.0
  'Adlm', 'Bhks', 'Marc', 'Newa', 'Osge', 'Tang', 'Gonm', 'Nshu', 'Soyo',
  'Zanb',
#New for Unicode 11.0.0
  'Dogr', 'Gong', 'Rohg', 'Maka', 'Medf', 'Sogo', 'Sogd',
#New for Unicode 12.0.0
  'Elym', 'Nand', 'Hmnp', 'Wcho',
#New for Unicode 13.0.0
  'Chrs', 'Diak', 'Kits', 'Yezi',
#New for Unicode 14.0.0
  'Cpmn', 'Ougr', 'Tngs', 'Toto', 'Vith'
  ]

# General category codes; the index of a name is the ucp_xx character type
# value (order must not change).
category_names = ['Cc', 'Cf', 'Cn', 'Co', 'Cs', 'Ll', 'Lm', 'Lo', 'Lt', 'Lu',
  'Mc', 'Me', 'Mn', 'Nd', 'Nl', 'No', 'Pc', 'Pd', 'Pe', 'Pf', 'Pi', 'Po', 'Ps',
  'Sc', 'Sk', 'Sm', 'So', 'Zl', 'Zp', 'Zs' ]

# The Extended_Pictographic property is not found in the file where all the
# others are (GraphemeBreakProperty.txt). It comes from the emoji-data.txt
# file, but we list it here so that the name has the correct index value.

break_property_names = ['CR', 'LF', 'Control', 'Extend', 'Prepend',
  'SpacingMark', 'L', 'V', 'T', 'LV', 'LVT', 'Regional_Indicator', 'Other',
  'ZWJ', 'Extended_Pictographic' ]

# Sanity-check the record size computation before doing any real work.
test_record_size()

# The Unicode version is taken from the first data file read; the versions of
# the remaining files are checked against it by read_table().
unicode_version = ""

# Read the basic per-code-point property tables from the Unicode data files.
script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_property_names), break_property_names.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)

# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "other". We scan the emoji-data.txt file and modify the
# break-props table.
# Scan emoji-data.txt and mark Extended_Pictographic characters in the
# break_props table (their default break property must be 'Other').

file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
for line in file:
  line = re.sub(r'#.*', '', line)
  chardata = list(map(str.strip, line.split(';')))
  if len(chardata) <= 1:
    continue

  if chardata[1] != "Extended_Pictographic":
    continue

  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
  char = int(m.group(1), 16)
  if m.group(3) is None:
    last = char
  else:
    last = int(m.group(3), 16)
  for i in range(char, last + 1):
    if break_props[i] != break_property_names.index('Other'):
      # Bug fix: interpolate the values into the message; the original
      # passed them as extra positional arguments to print(), so the raw
      # format string was written to stderr.
      print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
        (i, break_property_names[break_props[i]]), file=sys.stderr)
    break_props[i] = break_property_names.index('Extended_Pictographic')
file.close()

# The Script Extensions property default value is the Script value. Parse the
# file, setting 'Unknown' as the default (this will never be a Script Extension
# value), then scan it and fill in the default from Scripts. Code added by PH
# in October 2018. Positive values are used for just a single script for a
# code point. Negative values are negated offsets in a list of lists of
# multiple scripts. Initialize this list with a single entry, as the zeroth
# element is never used.

script_lists = [0]
script_abbrevs_default = script_abbrevs.index('Zzzz')
scriptx = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, script_abbrevs_default)

for i in range(0, MAX_UNICODE):
  if scriptx[i] == script_abbrevs_default:
    scriptx[i] = script[i]

# With the addition of the new Script Extensions field, we need some padding
# to get the Unicode records up to 12 bytes (multiple of 4). Set a value
# greater than 255 to make the field 16 bits.

padding_dummy = [0] * MAX_UNICODE
padding_dummy[0] = 256

# This block of code was added by PH in September 2012. I am not a Python
# programmer, so the style is probably dreadful, but it does the job. It scans
# the other_case table to find sets of more than two characters that must all
# match each other caselessly. Later in this script a table of these sets is
# written out. However, we have to do this work here in order to compute the
# offsets in the table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, unite the existing set with the new set.

    appended = 0
    for s in sets:
      found = 0
      for x in s:
        if x == c or x == o or x == t:
          found = 1

      # Add new characters to an existing set

      if found:
        found = 0
        # NOTE(review): 'found' is not reset for each y below; once one of
        # c/o/t is seen in s, later absent members are not appended. This
        # matches the original statement order — confirm intended behavior.
        for y in [c, o, t]:
          for x in s:
            if x == y:
              found = 1
          if not found:
            s.append(y)
            appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters.
# Each caseless set occupies len(s) + 1 slots in the output table (the set's
# characters plus a NOTACHAR terminator). Offset 0 is reserved to mean "not
# part of any set", so real offsets start at 1.

caseless_offsets = [0] * MAX_UNICODE

offset = 1;
for s in sets:
  for x in s:
    caseless_offsets[x] = offset
  offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.


# Combine the tables

table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case, scriptx, padding_dummy)

record_size, record_struct = get_record_size_struct(list(records.keys()))

# Find the optimum block size for the two-stage table
min_size = sys.maxsize
for block_size in [2 ** i for i in range(5,10)]:
  size = len(records) * record_size
  stage1, stage2 = compress_table(table, block_size)
  size += get_tables_size(stage1, stage2)
  #print "/* block size %5d => %5d bytes */" % (block_size, size)
  if size < min_size:
    min_size = size
    min_stage1, min_stage2 = stage1, stage2
    min_block_size = block_size

# Everything from here on writes the generated pcre2_ucd.c file to stdout.

print("/* This module is generated by the maint/MultiStage2.py script.")
print("Do not modify it by hand. Instead modify the script and run it")
print("to regenerate this code.")
print()
print("As well as being part of the PCRE2 library, this module is #included")
print("by the pcre2test program, which redefines the PRIV macro to change")
print("table names from _pcre2_xxx to xxxx, thereby avoiding name clashes")
print("with the library. At present, just one of these tables is actually")
print("needed. */")
print()
print("#ifndef PCRE2_PCRE2TEST")
print()
print("#ifdef HAVE_CONFIG_H")
print("#include \"config.h\"")
print("#endif")
print()
print("#include \"pcre2_internal.h\"")
print()
print("#endif /* PCRE2_PCRE2TEST */")
print()
print("/* Unicode character database. */")
print("/* This file was autogenerated by the MultiStage2.py script. */")
print("/* Total size: %d bytes, block size: %d. */" % (min_size, min_block_size))
print()
print("/* The tables herein are needed only when UCP support is built,")
print("and in PCRE2 that happens automatically with UTF support.")
print("This module should not be referenced otherwise, so")
print("it should not matter whether it is compiled or not. However")
print("a comment was received about space saving - maybe the guy linked")
print("all the modules rather than using a library - so we include a")
print("condition to cut out the tables when not needed. But don't leave")
print("a totally empty module because some compilers barf at that.")
print("Instead, just supply some small dummy tables. */")
print()
print("#ifndef SUPPORT_UNICODE")
print("const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0 }};")
print("const uint16_t PRIV(ucd_stage1)[] = {0};")
print("const uint16_t PRIV(ucd_stage2)[] = {0};")
print("const uint32_t PRIV(ucd_caseless_sets)[] = {0};")
print("#else")
print()
print("const char *PRIV(unicode_version) = \"{}\";".format(unicode_version))
print()
print("/* If the 32-bit library is run in non-32-bit mode, character values")
print("greater than 0x10ffff may be encountered. For these we set up a")
print("special record. */")
print()
print("#if PCRE2_CODE_UNIT_WIDTH == 32")
print("const ucd_record PRIV(dummy_ucd_record)[] = {{")
print("  ucp_Unknown,    /* script */")
print("  ucp_Cn,         /* type unassigned */")
print("  ucp_gbOther,    /* grapheme break property */")
print("  0,              /* case set */")
print("  0,              /* other case */")
print("  ucp_Unknown,    /* script extension */")
print("  0,              /* dummy filler */")
print("  }};")
print("#endif")
print()
print(record_struct)

# --- Added by PH: output the table of caseless character sets ---

print("/* This table contains lists of characters that are caseless sets of")
print("more than one character. Each list is terminated by NOTACHAR. */\n")

print("const uint32_t PRIV(ucd_caseless_sets)[] = {")
print("  NOTACHAR,")
for s in sets:
  s = sorted(s)
  for x in s:
    print('  0x%04x,' % x, end=' ')
  print('  NOTACHAR,')
print('};')
print()

# ------

print("/* When #included in pcre2test, we don't need the table of digit")
print("sets, nor the the large main UCD tables. */")
print()
print("#ifndef PCRE2_PCRE2TEST")
print()

# --- Added by PH: read Scripts.txt again for the sets of 10 digits. ---

digitsets = []
file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')

# Each run of Nd characters must contain whole sets of 10 digits; record the
# code point of the '9' in each set.
for line in file:
  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
  if m is None:
    continue
  first = int(m.group(1),16)
  last = int(m.group(2),16)
  if ((last - first + 1) % 10) != 0:
    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
      file=sys.stderr)
  while first < last:
    digitsets.append(first + 9)
    first += 10
file.close()
digitsets.sort()

print("/* This table lists the code points for the '9' characters in each")
print("set of decimal digits. It is used to ensure that all the digits in")
print("a script run come from the same set. */\n")
print("const uint32_t PRIV(ucd_digit_sets)[] = {")

print("  %d,  /* Number of subsequent values */" % len(digitsets), end='')
count = 8
for d in digitsets:
  if count == 8:
    print("\n ", end='')
    count = 0
  print(" 0x%05x," % d, end='')
  count += 1
print("\n};\n")

print("/* This vector is a list of lists of scripts for the Script Extension")
print("property. Each sublist is zero-terminated. */\n")
print("const uint8_t PRIV(ucd_script_sets)[] = {")

# Each zero terminator ends a sublist; annotate the offset of the next one.
count = 0
print("  /*   0 */", end='')
for d in script_lists:
  print(" %3d," % d, end='')
  count += 1
  if d == 0:
    print("\n  /* %3d */" % count, end='')
print("\n};\n")

# Output the main UCD tables.

print("/* These are the main two-stage UCD tables. The fields in each record are:")
print("script (8 bits), character type (8 bits), grapheme break property (8 bits),")
print("offset to multichar other cases or zero (8 bits), offset to other case")
print("or zero (32 bits, signed), script extension (16 bits, signed), and a dummy")
print("16-bit field to make the whole thing a multiple of 4 bytes. */\n")

print_records(records, record_size)
print_table(min_stage1, 'PRIV(ucd_stage1)')
print_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)
print("#if UCD_BLOCK_SIZE != %d" % min_block_size)
print("#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h")
print("#endif")
print("#endif /* SUPPORT_UNICODE */")
print()
print("#endif /* PCRE2_PCRE2TEST */")


# This code was part of the original contribution, but is commented out as it
# was never used. A two-stage table has sufficed.
# NOTE: The following triple-quoted string is intentionally dead code: a
# module-level string expression that is never executed. It preserves the
# original experimental three-stage table builder (still in Python 2 syntax)
# for reference only.

"""

# Three-stage tables:

# Find the optimum block size for 3-stage table
min_size = sys.maxint
for stage3_block in [2 ** i for i in range(2,6)]:
  stage_i, stage3 = compress_table(table, stage3_block)
  for stage2_block in [2 ** i for i in range(5,10)]:
    size = len(records) * 4
    stage1, stage2 = compress_table(stage_i, stage2_block)
    size += get_tables_size(stage1, stage2, stage3)
    # print "/* %5d / %3d => %5d bytes */" % (stage2_block, stage3_block, size)
    if size < min_size:
      min_size = size
      min_stage1, min_stage2, min_stage3 = stage1, stage2, stage3
      min_stage2_block, min_stage3_block = stage2_block, stage3_block

print "/* Total size: %d bytes" % min_size */
print_records(records)
print_table(min_stage1, 'ucd_stage1')
print_table(min_stage2, 'ucd_stage2', min_stage2_block)
print_table(min_stage3, 'ucd_stage3', min_stage3_block)

"""