• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/python
2
3#                   PCRE2 UNICODE PROPERTY SUPPORT
4#                   ------------------------------
5#
6# This script generates the pcre2_ucd.c file from Unicode data files. This is
7# the compressed Unicode property data used by PCRE2. The script was created in
8# December 2021 as part of the Unicode data generation refactoring. It is
9# basically a re-working of the MultiStage2.py script that was submitted to the
10# PCRE project by Peter Kankowski in 2008 as part of a previous upgrading of
11# Unicode property support. A number of extensions have since been added. The
12# main difference in the 2021 upgrade (apart from comments and layout) is that
13# the data tables (e.g. list of script names) are now listed in or generated by
14# a separate Python module that is shared with the other Generate scripts.
15#
16# This script must be run in the "maint" directory. It requires the following
17# Unicode data tables: BidiMirroring.txt, CaseFolding.txt,
18# DerivedBidiClass.txt, DerivedCoreProperties.txt, DerivedGeneralCategory.txt,
19# GraphemeBreakProperty.txt, PropList.txt, PropertyAliases.txt,
20# PropertyValueAliases.txt, ScriptExtensions.txt, Scripts.txt, and
21# emoji-data.txt. These must be in the Unicode.tables subdirectory.
22#
23# The emoji-data.txt file is found in the "emoji" subdirectory even though it
24# is technically part of a different (but coordinated) standard as shown
25# in files associated with Unicode Technical Standard #51 ("Unicode Emoji"),
26# for example:
27#
28# http://unicode.org/Public/emoji/13.0/ReadMe.txt
29#
30# DerivedBidiClass.txt and DerivedGeneralCategory.txt are in the "extracted"
31# subdirectory of the Unicode database (UCD) on the Unicode web site;
32# GraphemeBreakProperty.txt is in the "auxiliary" subdirectory. The other files
33# are in the top-level UCD directory.
34#
35# -----------------------------------------------------------------------------
36# Minor modifications made to the original script:
37#  Added #! line at start
38#  Removed tabs
39#  Made it work with Python 2.4 by rewriting two statements that needed 2.5
40#  Consequent code tidy
41#  Adjusted data file names to take from the Unicode.tables directory
42#  Adjusted global table names by prefixing _pcre_.
43#  Commented out stuff relating to the casefolding table, which isn't used;
44#    removed completely in 2012.
45#  Corrected size calculation
46#  Add #ifndef SUPPORT_UCP to use dummy tables when no UCP support is needed.
47#  Update for PCRE2: name changes, and SUPPORT_UCP is abolished.
48#
49# Major modifications made to the original script:
50#  Added code to add a grapheme break property field to records.
51#
52#  Added code to search for sets of more than two characters that must match
53#  each other caselessly. A new table is output containing these sets, and
54#  offsets into the table are added to the main output records. This new
55#  code scans CaseFolding.txt instead of UnicodeData.txt, which is no longer
56#  used.
57#
58#  Update for Python3:
59#    . Processed with 2to3, but that didn't fix everything
60#    . Changed string.strip to str.strip
61#    . Added encoding='utf-8' to the open() call
62#    . Inserted 'int' before blocksize/ELEMS_PER_LINE because an int is
63#        required and the result of the division is a float
64#
65#  Added code to scan the emoji-data.txt file to find the Extended Pictographic
66#  property, which is used by PCRE2 as a grapheme breaking property. This was
67#  done when updating to Unicode 11.0.0 (July 2018).
68#
69#  Added code to add a Script Extensions field to records. This has increased
70#  their size from 8 to 12 bytes, only 10 of which are currently used.
71#
72#  Added code to add a bidi class field to records by scanning the
73#  DerivedBidiClass.txt and PropList.txt files. This uses one of the two spare
74#  bytes, so now 11 out of 12 are in use.
75#
76# 01-March-2010:     Updated list of scripts for Unicode 5.2.0
77# 30-April-2011:     Updated list of scripts for Unicode 6.0.0
78#     July-2012:     Updated list of scripts for Unicode 6.1.0
79# 20-August-2012:    Added scan of GraphemeBreakProperty.txt and added a new
80#                      field in the record to hold the value. Luckily, the
81#                      structure had a hole in it, so the resulting table is
82#                      not much bigger than before.
83# 18-September-2012: Added code for multiple caseless sets. This uses the
84#                      final hole in the structure.
85# 30-September-2012: Added RegionalIndicator break property from Unicode 6.2.0
86# 13-May-2014:       Updated for PCRE2
87# 03-June-2014:      Updated for Python 3
88# 20-June-2014:      Updated for Unicode 7.0.0
89# 12-August-2014:    Updated to put Unicode version into the file
90# 19-June-2015:      Updated for Unicode 8.0.0
91# 02-July-2017:      Updated for Unicode 10.0.0
92# 03-July-2018:      Updated for Unicode 11.0.0
93# 07-July-2018:      Added code to scan emoji-data.txt for the Extended
94#                      Pictographic property.
95# 01-October-2018:   Added the 'Unknown' script name
96# 03-October-2018:   Added new field for Script Extensions
97# 27-July-2019:      Updated for Unicode 12.1.0
98# 10-March-2020:     Updated for Unicode 13.0.0
99# PCRE2-10.39:       Updated for Unicode 14.0.0
100# 05-December-2021:  Added code to scan DerivedBidiClass.txt for bidi class,
101#                      and also PropList.txt for the Bidi_Control property
102# 19-December-2021:  Reworked script extensions lists to be bit maps instead
103#                      of zero-terminated lists of script numbers.
104# ----------------------------------------------------------------------------
105#
106# Changes to the refactored script:
107#
108# 26-December-2021:  Refactoring completed
109# 10-January-2022:   Addition of general Boolean property support
110# 12-January-2022:   Merge scriptx and bidiclass fields
111# 14-January-2022:   Enlarge Boolean property offset to 12 bits
112#
113# ----------------------------------------------------------------------------
114#
115#
116# The main tables generated by this script are used by macros defined in
117# pcre2_internal.h. They look up Unicode character properties using short
118# sequences of code that contains no branches, which makes for greater speed.
119#
120# Conceptually, there is a table of records (of type ucd_record), one for each
121# Unicode character. Each record contains the script number, script extension
122# value, character type, grapheme break type, offset to caseless matching set,
123# offset to the character's other case, the bidi class, and offset to bitmap of
124# Boolean properties.
125#
126# A real table covering all Unicode characters would be far too big. It can be
127# efficiently compressed by observing that many characters have the same
128# record, and many blocks of characters (taking 128 characters in a block) have
129# the same set of records as other blocks. This leads to a 2-stage lookup
130# process.
131#
132# This script constructs seven tables. The ucd_caseless_sets table contains
133# lists of characters that all match each other caselessly. Each list is
134# in order, and is terminated by NOTACHAR (0xffffffff), which is larger than
135# any valid character. The first list is empty; this is used for characters
136# that are not part of any list.
137#
138# The ucd_digit_sets table contains the code points of the '9' characters in
139# each set of 10 decimal digits in Unicode. This is used to ensure that digits
140# in script runs all come from the same set. The first element in the vector
141# contains the number of subsequent elements, which are in ascending order.
142#
143# Scripts are partitioned into two groups. Scripts that appear in at least one
144# character's script extension list come first, followed by "Unknown" and then
145# all the rest. This sorting is done automatically in the GenerateCommon.py
146# script. A script's number is its index in the script_names list.
147#
148# The ucd_script_sets table contains bitmaps that represent lists of scripts
149# for Script Extensions properties. Each bitmap consists of a fixed number of
150# unsigned 32-bit numbers, enough to allocate a bit for every script that is
151# used in any character's extension list, that is, enough for every script
152# whose number is less than ucp_Unknown. A character's script extension value
153# in its ucd record is an offset into the ucd_script_sets vector. The first
154# bitmap has no bits set; characters that have no script extensions have zero
155# as their script extensions value so that they use this map.
156#
157# The ucd_boolprop_sets table contains bitmaps that represent lists of Boolean
158# properties. Each bitmap consists of a fixed number of unsigned 32-bit
159# numbers, enough to allocate a bit for each supported Boolean property.
160#
161# The ucd_records table contains one instance of every unique character record
162# that is required. The ucd_stage1 table is indexed by a character's block
163# number, which is the character's code point divided by 128, since 128 is the
164# size of each block. The result of a lookup in ucd_stage1 is a "virtual" block
165# number.
166#
167# The ucd_stage2 table is a table of "virtual" blocks; each block is indexed by
168# the offset of a character within its own block, and the result is the index
169# number of the required record in the ucd_records vector.
170#
171# The following examples are correct for the Unicode 14.0.0 database. Future
172# updates may change the actual lookup values.
173#
174# Example: lowercase "a" (U+0061) is in block 0
175#          lookup 0 in stage1 table yields 0
176#          lookup 97 (0x61) in the first table in stage2 yields 35
177#          record 35 is { 0, 5, 12, 0, -32, 18432, 44 }
178#             0 = ucp_Latin   => Latin script
179#             5 = ucp_Ll      => Lower case letter
180#            12 = ucp_gbOther => Grapheme break property "Other"
181#             0               => Not part of a caseless set
182#           -32 (-0x20)       => Other case is U+0041
183#         18432 = 0x4800      => Combined Bidi class + script extension values
184#            44               => Offset to Boolean properties
185#
186# The top 5 bits of the sixth field are the Bidi class, with the rest being the
187# script extension value, giving:
188#
189#             9 = ucp_bidiL   => Bidi class left-to-right
190#             0               => No special script extension property
191#
192# Almost all lowercase latin characters resolve to the same record. One or two
193# are different because they are part of a multi-character caseless set (for
194# example, k, K and the Kelvin symbol are such a set).
195#
196# Example: hiragana letter A (U+3042) is in block 96 (0x60)
197#          lookup 96 in stage1 table yields 93
198#          lookup 66 (0x42) in table 93 in stage2 yields 819
199#          record 819 is { 20, 7, 12, 0, 0, 18432, 82 }
200#            20 = ucp_Hiragana => Hiragana script
201#             7 = ucp_Lo       => Other letter
202#            12 = ucp_gbOther  => Grapheme break property "Other"
203#             0                => Not part of a caseless set
204#             0                => No other case
205#         18432 = 0x4800       => Combined Bidi class + script extension values
206#            82                => Offset to Boolean properties
207#
208# The top 5 bits of the sixth field are the Bidi class, with the rest being the
209# script extension value, giving:
210#
211#             9 = ucp_bidiL   => Bidi class left-to-right
212#             0               => No special script extension property
213#
214# Example: vedic tone karshana (U+1CD0) is in block 57 (0x39)
215#          lookup 57 in stage1 table yields 55
216#          lookup 80 (0x50) in table 55 in stage2 yields 621
217#          record 621 is { 84, 12, 3, 0, 0, 26762, 96 }
218#            84 = ucp_Inherited => Script inherited from predecessor
219#            12 = ucp_Mn        => Non-spacing mark
220#             3 = ucp_gbExtend  => Grapheme break property "Extend"
221#             0                 => Not part of a caseless set
222#             0                 => No other case
223#         26762 = 0x688A        => Combined Bidi class + script extension values
224#            96                 => Offset to Boolean properties
225#
226# The top 5 bits of the sixth field are the Bidi class, with the rest being the
227# script extension value, giving:
228#
229#            13 = ucp_bidiNSM   => Bidi class non-spacing mark
230#           138                 => Script Extension list offset = 138
231#
232# At offset 138 in the ucd_script_sets vector we find a bitmap with bits 1, 8,
233# 18, and 47 set. This means that this character is expected to be used with
234# any of those scripts, which are Bengali, Devanagari, Kannada, and Grantha.
235#
236#  Philip Hazel, last updated 14 January 2022.
237##############################################################################
238
239
240# Import standard modules
241
242import re
243import string
244import sys
245
246# Import common data lists and functions
247
248from GenerateCommon import \
249  bidi_classes, \
250  bool_properties, \
251  bool_propsfiles, \
252  bool_props_list_item_size, \
253  break_properties, \
254  category_names, \
255  general_category_names, \
256  script_abbrevs, \
257  script_list_item_size, \
258  script_names, \
259  open_output
260
# Some general parameters

MAX_UNICODE = 0x110000  # One more than the highest Unicode code point (0x10FFFF)
NOTACHAR = 0xffffffff   # List terminator; larger than any valid code point
265
266
267# ---------------------------------------------------------------------------
268#                         DEFINE FUNCTIONS
269# ---------------------------------------------------------------------------
270
271
272# Parse a line of Scripts.txt, GraphemeBreakProperty.txt, DerivedBidiClass.txt
273# or DerivedGeneralCategory.txt
274
def make_get_names(enum):
  """Build a parser for property data lines.

  The returned callable maps a split data line (chardata) to the index of
  chardata[1] within the given enumeration list.
  """
  def get_name(chardata):
    return enum.index(chardata[1])
  return get_name
277
278
279# Parse a line of CaseFolding.txt
280
def get_other_case(chardata):
  """Parse one line of CaseFolding.txt.

  For common ('C') or simple ('S') foldings, return the signed offset from
  the source code point (chardata[0]) to its folded form (chardata[2]).
  Any other folding status yields 0.
  """
  if chardata[1] in ('C', 'S'):
    delta = int(chardata[2], 16) - int(chardata[0], 16)
    return delta
  return 0
285
286
287# Parse a line of ScriptExtensions.txt
288
def get_script_extension(chardata):
  """Parse one line of ScriptExtensions.txt.

  Returns the offset of this line's script list within the (flattened)
  script_lists table. Consecutive lines carrying an identical extension
  string share one list; a new string appends a new tuple of script
  numbers to the global script_lists.
  """
  global last_script_extension

  next_offset = len(script_lists) * script_list_item_size
  extension = chardata[1]
  if extension == last_script_extension:
    # Same list as the previous line: reuse the most recently added entry.
    return next_offset - script_list_item_size

  last_script_extension = extension
  scripts = tuple(script_abbrevs.index(a) for a in extension.split(' '))
  script_lists.append(scripts)
  return next_offset
299
300
301# Read a whole table in memory, setting/checking the Unicode version
302
def read_table(file_name, get_value, default_value):
  """Read a whole Unicode data table into a list indexed by code point.

  file_name     -- path of the form 'Unicode.tables/Xxx.txt'
  get_value     -- callable mapping a split data line to a property value
  default_value -- value used for code points the file does not mention

  The first line of each data file names its Unicode version. The version
  of the first file read is remembered in the global unicode_version, and
  subsequent files are checked against it.
  """
  global unicode_version

  # Derive the base file name, used to match the version header line.
  f = re.match(r'^[^/]+/([^.]+)\.txt$', file_name)
  file_base = f.group(1)
  version_pat = r"^# " + re.escape(file_base) + r"-(\d+\.\d+\.\d+)\.txt$"
  file = open(file_name, 'r', encoding='utf-8')
  f = re.match(version_pat, file.readline())
  version = f.group(1)
  if unicode_version == "":
    unicode_version = version
  elif unicode_version != version:
    # Fixed: the filename was previously passed as a separate argument to
    # print(), so the literal "%s" appeared in the message.
    print("WARNING: Unicode version differs in %s" % file_name, file=sys.stderr)

  table = [default_value] * MAX_UNICODE
  for line in file:
    line = re.sub(r'#.*', '', line)     # strip trailing comments
    chardata = list(map(str.strip, line.split(';')))
    if len(chardata) <= 1:
      continue
    value = get_value(chardata)
    # A line covers either a single code point or a AAAA..BBBB range.
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)
    for i in range(char, last + 1):
      # It is important not to overwrite a previously set value because in the
      # CaseFolding file there are lines to be ignored (returning the default
      # value of 0) which often come after a line which has already set data.
      if table[i] == default_value:
        table[i] = value
  file.close()
  return table
338
339
340# Get the smallest possible C language type for the values in a table
341
def get_type_size(table):
  """Return (C type name, size in bytes) of the smallest C integer type
  able to hold every value in table.

  Unsigned types are preferred; signed types are only considered after
  all unsigned ones. Raises OverflowError if no 32-bit type suffices.
  """
  candidates = (
    ("uint8_t", 1, 0, 255),
    ("uint16_t", 2, 0, 65535),
    ("uint32_t", 4, 0, 4294967295),
    ("signed char", 1, -128, 127),
    ("int16_t", 2, -32768, 32767),
    ("int32_t", 4, -2147483648, 2147483647),
  )
  lo = min(table)
  hi = max(table)
  for name, size, minlimit, maxlimit in candidates:
    if minlimit <= lo and hi <= maxlimit:
      return (name, size)
  raise OverflowError("Too large to fit into C types")
353
354
355# Get the total size of a list of tables
356
def get_tables_size(*tables):
  """Return the total byte size of the given tables, with each table
  stored using the smallest C type that fits its values."""
  return sum(get_type_size(t)[1] * len(t) for t in tables)
363
364
365# Compress a table into the two stages
366
def compress_table(table, block_size):
  """Compress a per-code-point table into a two-stage lookup.

  Returns (stage1, stage2). stage1 maps a block number (code point //
  block_size) to a "virtual" block number; stage2 is the concatenation of
  the de-duplicated blocks of property values.
  """
  blocks = {} # Dictionary for finding identical blocks
  stage1 = [] # Stage 1 table contains block numbers (indices into stage 2 table)
  stage2 = [] # Stage 2 table contains the blocks with property values
  table = tuple(table)
  for i in range(0, len(table), block_size):
    block = table[i:i+block_size]
    start = blocks.get(block)
    if start is None:
      # Allocate a new block. Use integer division (//) so stage1 holds
      # ints rather than floats; len(stage2) is always an exact multiple
      # of block_size at this point, so the result is unchanged in value.
      start = len(stage2) // block_size
      stage2 += block
      blocks[block] = start
    stage1.append(start)
  return stage1, stage2
382
383
384# Output a table
385
def write_table(table, table_name, block_size = None):
  # Write one C array definition to the output file (global f), using the
  # smallest C type that fits the table's values. With block_size None the
  # values are written 16 per line, each line ending with a /* U+xxxx */
  # comment; otherwise one block is written per group under a
  # "/* block n */" heading.
  type, size = get_type_size(table)
  ELEMS_PER_LINE = 16

  s = "const %s %s[] = { /* %d bytes" % (type, table_name, size * len(table))
  if block_size:
    s += ", block = %d" % block_size
  f.write(s + " */\n")
  table = tuple(table)
  if block_size is None:
    fmt = "%3d," * ELEMS_PER_LINE + " /* U+%04X */\n"
    # Scale factor from table index to code point; int() is applied at use
    # because this division yields a float in Python 3.
    mult = MAX_UNICODE / len(table)
    for i in range(0, len(table), ELEMS_PER_LINE):
      f.write(fmt % (table[i:i+ELEMS_PER_LINE] + (int(i * mult),)))
  else:
    # Each output line holds at most ELEMS_PER_LINE values; longer blocks
    # are spread over several lines by repeating the format string.
    if block_size > ELEMS_PER_LINE:
      el = ELEMS_PER_LINE
    else:
      el = block_size
    fmt = "%3d," * el + "\n"
    if block_size > ELEMS_PER_LINE:
      fmt = fmt * int(block_size / ELEMS_PER_LINE)
    for i in range(0, len(table), block_size):
      f.write(("\n/* block %d */\n" + fmt) % ((i / block_size,) + table[i:i+block_size]))
  f.write("};\n\n")
411
412
413# Extract the unique combinations of properties into records
414
def combine_tables(*tables):
  """Combine parallel per-code-point tables into unique records.

  Returns (index, records): index[c] is the record number for code point
  c, and records maps each unique tuple of property values to its record
  number (assigned in order of first appearance).
  """
  records = {}
  index = []
  for combo in zip(*tables):
    if combo not in records:
      records[combo] = len(records)
    index.append(records[combo])
  return index, records
424
425
426# Create a record struct
427
def get_record_size_struct(records):
  # Compute the byte size of a C struct holding one record, accounting for
  # field alignment, and build the text of a matching C typedef (which is
  # written into the output file as part of a comment).
  size = 0
  structure = 'typedef struct {\n'
  for i in range(len(records[0])):
    record_slice = [record[i] for record in records]
    slice_type, slice_size = get_type_size(record_slice)
    # Add padding: round the offset up to the next multiple of slice_size.
    # slice_size is always a power of two, so the AND with -slice_size
    # clears the low bits correctly.
    size = (size + slice_size - 1) & -slice_size
    size += slice_size
    structure += '%s property_%d;\n' % (slice_type, i)

  # Round the total up to the alignment of the first field so consecutive
  # records in an array remain correctly aligned.
  record_slice = [record[0] for record in records]
  slice_type, slice_size = get_type_size(record_slice)
  size = (size + slice_size - 1) & -slice_size

  # The trailing "*/" closes a C comment opened by the caller's heading text.
  structure += '} ucd_record;\n*/\n'
  return size, structure
446
447
448# Write records
449
def write_records(records, record_size):
  # Write the ucd_records table to the output file (global f). The records
  # dictionary maps a property tuple to its record number; entries are
  # emitted in record-number order, each with a /* n */ index comment.
  f.write('const ucd_record PRIV(ucd_records)[] = { ' + \
    '/* %d bytes, record size %d */\n' % (len(records) * record_size, record_size))
  records = list(zip(list(records.keys()), list(records.values())))
  records.sort(key = lambda x: x[1])
  for i, record in enumerate(records):
    f.write(('  {' + '%6d, ' * len(record[0]) + '}, /* %3d */\n') % (record[0] + (i,)))
  f.write('};\n\n')
458
459
460# Write a bit set
461
def write_bitsets(list, item_size):
  # Write a vector of bitmaps to the output file (global f). Each element
  # of 'list' is a list of bit numbers; each bitmap occupies item_size
  # 32-bit words. The caller is expected to have written the array's
  # opening line; this function writes the closing brace.
  # NOTE(review): the parameter name 'list' shadows the built-in; kept
  # unchanged for compatibility with existing callers.
  for d in list:
    bitwords = [0] * item_size
    for idx in d:
      bitwords[idx // 32] |= 1 << (idx & 31)
    # Emit words separated by ", ", with a leading space on the first.
    s = " "
    for x in bitwords:
      f.write("%s" % s)
      s = ", "
      f.write("0x%08xu" % x)
    f.write(",\n")
  f.write("};\n\n")
474
475
476# ---------------------------------------------------------------------------
477# This bit of code must have been useful when the original script was being
478# developed. Retain it just in case it is ever needed again.
479
480# def test_record_size():
481#   tests = [ \
482#     ( [(3,), (6,), (6,), (1,)], 1 ), \
483#     ( [(300,), (600,), (600,), (100,)], 2 ), \
484#     ( [(25, 3), (6, 6), (34, 6), (68, 1)], 2 ), \
485#     ( [(300, 3), (6, 6), (340, 6), (690, 1)], 4 ), \
486#     ( [(3, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
487#     ( [(300, 300), (6, 6), (6, 340), (1, 690)], 4 ), \
488#     ( [(3, 100000), (6, 6), (6, 123456), (1, 690)], 8 ), \
489#     ( [(100000, 300), (6, 6), (123456, 6), (1, 690)], 8 ), \
490#   ]
491#   for test in tests:
492#     size, struct = get_record_size_struct(test[0])
493#     assert(size == test[1])
494# test_record_size()
495# ---------------------------------------------------------------------------
496
497
498
499# ---------------------------------------------------------------------------
500#                       MAIN CODE FOR CREATING TABLES
501# ---------------------------------------------------------------------------
502
unicode_version = ""

# Some of the tables imported from GenerateCommon.py have alternate comment
# strings for use by GenerateUcpHeader. The comments are not wanted here, so
# remove them by keeping only every second element.

bidi_classes = bidi_classes[::2]
break_properties = break_properties[::2]
category_names = category_names[::2]

# Create the various tables from the Unicode data files. Each table is a
# list indexed by code point; the third argument of read_table() is the
# default value for code points that a file does not mention.

script = read_table('Unicode.tables/Scripts.txt', make_get_names(script_names), script_names.index('Unknown'))
category = read_table('Unicode.tables/DerivedGeneralCategory.txt', make_get_names(category_names), category_names.index('Cn'))
break_props = read_table('Unicode.tables/GraphemeBreakProperty.txt', make_get_names(break_properties), break_properties.index('Other'))
other_case = read_table('Unicode.tables/CaseFolding.txt', get_other_case, 0)
bidi_class = read_table('Unicode.tables/DerivedBidiClass.txt', make_get_names(bidi_classes), bidi_classes.index('L'))
520
# The grapheme breaking rules were changed for Unicode 11.0.0 (June 2018). Now
# we need to find the Extended_Pictographic property for emoji characters. This
# can be set as an additional grapheme break property, because the default for
# all the emojis is "other". We scan the emoji-data.txt file and modify the
# break-props table.

file = open('Unicode.tables/emoji-data.txt', 'r', encoding='utf-8')
for line in file:
  line = re.sub(r'#.*', '', line)     # strip trailing comments
  chardata = list(map(str.strip, line.split(';')))
  if len(chardata) <= 1:
    continue
  if chardata[1] != "Extended_Pictographic":
    continue
  # A line covers either a single code point or a AAAA..BBBB range.
  m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', chardata[0])
  char = int(m.group(1), 16)
  if m.group(3) is None:
    last = char
  else:
    last = int(m.group(3), 16)
  for i in range(char, last + 1):
    # An emoji whose break property is not "Other" would clash with using
    # Extended_Pictographic as an extra break property value.
    if break_props[i] != break_properties.index('Other'):
      # Fixed: arguments were previously passed to print() separately, so
      # the literal "%x"/"%s" appeared in the warning message.
      print("WARNING: Emoji 0x%x has break property %s, not 'Other'" %
        (i, break_properties[break_props[i]]), file=sys.stderr)
    break_props[i] = break_properties.index('Extended_Pictographic')
file.close()
547
# Handle script extensions. The get_script_extension() function maintains a
# list of unique bitmaps representing lists of scripts, returning the offset
# in that list. Initialize the list with an empty set, which is used for
# characters that have no script extensions.

script_lists = [[]]
last_script_extension = ""
scriptx_bidi_class = read_table('Unicode.tables/ScriptExtensions.txt', get_script_extension, 0)

# Merge the bidi class into the same field: the class goes in the bits above
# bit 11, with the script extension offset below. The bidi_class table is no
# longer needed afterwards, so release it.

for idx in range(len(scriptx_bidi_class)):
  scriptx_bidi_class[idx] = scriptx_bidi_class[idx] | (bidi_class[idx] << 11)
bidi_class = None
560
# Find the Boolean properties of each character. This next bit of magic creates
# a list of empty lists. Using [[]] * MAX_UNICODE gives a list of references to
# the *same* list, which is not what we want.

bprops = [[] for _ in range(MAX_UNICODE)]

# Collect the properties from the various files

for filename in bool_propsfiles:
  try:
    file = open('Unicode.tables/' + filename, 'r')
  except IOError:
    print(f"** Couldn't open {'Unicode.tables/' + filename}\n")
    sys.exit(1)

  for line in file:
    line = re.sub(r'#.*', '', line)     # strip trailing comments
    data = list(map(str.strip, line.split(';')))
    if len(data) <= 1:
      continue

    # Silently skip properties that PCRE2 does not support.
    try:
      ix = bool_properties.index(data[1])
    except ValueError:
      continue

    # A line covers either a single code point or a AAAA..BBBB range.
    m = re.match(r'([0-9a-fA-F]+)(\.\.([0-9a-fA-F]+))?$', data[0])
    char = int(m.group(1), 16)
    if m.group(3) is None:
      last = char
    else:
      last = int(m.group(3), 16)

    for i in range(char, last + 1):
      bprops[i].append(ix)

  file.close()

# The ASCII property isn't listed in any files, but it is easy enough to add
# it manually.

ix = bool_properties.index("ASCII")
for i in range(128):
  bprops[i].append(ix)
605
# The Bidi_Mirrored property isn't listed in any property files. We have to
# deduce it from the file that lists the mirrored characters.

ix = bool_properties.index("Bidi_Mirrored")

try:
  file = open('Unicode.tables/BidiMirroring.txt', 'r')
except IOError:
  print(f"** Couldn't open {'Unicode.tables/BidiMirroring.txt'}\n")
  sys.exit(1)

for line in file:
  line = re.sub(r'#.*', '', line)     # strip trailing comments
  data = list(map(str.strip, line.split(';')))
  if len(data) <= 1:
    continue
  # The first field is the code point of a mirrored character.
  c = int(data[0], 16)
  bprops[c].append(ix)

file.close()
626
# Scan each character's Boolean property list and create a list of unique
# lists, at the same time setting each character's offset into that list in
# the bool_props vector.

bool_props = [0] * MAX_UNICODE
bool_props_lists = [[]]

for c in range(MAX_UNICODE):
  s = set(bprops[c])
  # Linear search for an existing list with the same set of properties; the
  # for/else appends a new list when none matches, leaving i as its index.
  for i in range(len(bool_props_lists)):
    if s == set(bool_props_lists[i]):
      break;
  else:
    bool_props_lists.append(bprops[c])
    i += 1

  # Store the byte offset of the character's bitmap, not the list index.
  bool_props[c] = i * bool_props_list_item_size
644
# This block of code was added by PH in September 2012. It scans the other_case
# table to find sets of more than two characters that must all match each other
# caselessly. Later in this script a table of these sets is written out.
# However, we have to do this work here in order to compute the offsets in the
# table that are inserted into the main table.

# The CaseFolding.txt file lists pairs, but the common logic for reading data
# sets only one value, so first we go through the table and set "return"
# offsets for those that are not already set.

for c in range(MAX_UNICODE):
  if other_case[c] != 0 and other_case[c + other_case[c]] == 0:
    other_case[c + other_case[c]] = -other_case[c]

# Now scan again and create equivalence sets.

caseless_sets = []

for c in range(MAX_UNICODE):
  o = c + other_case[c]

  # Trigger when this character's other case does not point back here. We
  # now have three characters that are case-equivalent.

  if other_case[o] != -other_case[c]:
    t = o + other_case[o]

    # Scan the existing sets to see if any of the three characters are already
    # part of a set. If so, unite the existing set with the new set.

    appended = 0
    for s in caseless_sets:
      if c in s or o in s or t in s:
        # Add any of the three characters that are missing to this set.
        # (Bug fix: the previous code reused a single 'found' flag without
        # resetting it for each candidate, so once one candidate was found
        # in the set, later candidates that were absent were never added.)
        for y in (c, o, t):
          if y not in s:
            s.append(y)
        appended = 1

    # If we have not added to an existing set, create a new one.

    if not appended:
      caseless_sets.append([c, o, t])

# End of loop looking for caseless sets.

# Now scan the sets and set appropriate offsets for the characters.

caseless_offsets = [0] * MAX_UNICODE

offset = 1
for s in caseless_sets:
  for x in s:
    caseless_offsets[x] = offset
  # Each set occupies its length plus one slot for the NOTACHAR terminator.
  offset += len(s) + 1

# End of block of code for creating offsets for caseless matching sets.
712
713
# Combine all the tables

table, records = combine_tables(script, category, break_props,
  caseless_offsets, other_case, scriptx_bidi_class, bool_props)

# Find the record size and create a string definition of the structure for
# outputting as a comment.

record_size, record_struct = get_record_size_struct(list(records.keys()))

# Find the optimum block size for the two-stage table by trying powers of
# two from 32 to 512 and keeping the one that minimizes the total size of
# the records plus both stage tables.

min_size = sys.maxsize
for block_size in [2 ** i for i in range(5,10)]:
  size = len(records) * record_size
  stage1, stage2 = compress_table(table, block_size)
  size += get_tables_size(stage1, stage2)
  #print "/* block size %5d  => %5d bytes */" % (block_size, size)
  if size < min_size:
    min_size = size
    min_stage1, min_stage2 = stage1, stage2
    min_block_size = block_size
736
737
# ---------------------------------------------------------------------------
#                   MAIN CODE FOR WRITING THE OUTPUT FILE
# ---------------------------------------------------------------------------

# Open the output file (no return on failure). This call also writes standard
# header boilerplate.

# NOTE(review): open_output() comes from the shared Generate module (not
# visible in this file) — presumably it terminates the script on failure
# rather than returning None; confirm against GenerateCommon.py.
f = open_output("pcre2_ucd.c")

# Output this file's heading text

# This heading sets up the conditional-compilation scaffolding for the
# generated C file: when SUPPORT_UNICODE is not defined, tiny dummy tables
# are emitted (because some compilers reject an empty translation unit) and
# the real tables follow in the #else branch, which is closed near the end
# of this script.
f.write("""\
/* This file contains tables of Unicode properties that are extracted from
Unicode data files. See the comments at the start of maint/GenerateUcd.py for
details.

As well as being part of the PCRE2 library, this file is #included by the
pcre2test program, which redefines the PRIV macro to change table names from
_pcre2_xxx to xxxx, thereby avoiding name clashes with the library. At present,
just one of these tables is actually needed. When compiling the library, some
headers are needed. */

#ifndef PCRE2_PCRE2TEST
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "pcre2_internal.h"
#endif /* PCRE2_PCRE2TEST */

/* The tables herein are needed only when UCP support is built, and in PCRE2
that happens automatically with UTF support. This module should not be
referenced otherwise, so it should not matter whether it is compiled or not.
However a comment was received about space saving - maybe the guy linked all
the modules rather than using a library - so we include a condition to cut out
the tables when not needed. But don't leave a totally empty module because some
compilers barf at that. Instead, just supply some small dummy tables. */

#ifndef SUPPORT_UNICODE
const ucd_record PRIV(ucd_records)[] = {{0,0,0,0,0,0,0}};
const uint16_t PRIV(ucd_stage1)[] = {0};
const uint16_t PRIV(ucd_stage2)[] = {0};
const uint32_t PRIV(ucd_caseless_sets)[] = {0};
#else
\n""")
782
# --- Output some variable heading stuff ---

# min_size/min_block_size/min_stage1/min_stage2 come from the block-size
# search loop earlier in this script; unicode_version is provided by the
# shared Generate module.
f.write("/* Total size: %d bytes, block size: %d. */\n\n" % (min_size, min_block_size))
f.write('const char *PRIV(unicode_version) = "{}";\n\n'.format(unicode_version))

# NOTE(review): the C comment opened by "/* When recompiling..." below is not
# closed in this string — presumably record_struct (defined in the shared
# Generate module) supplies the struct text and the closing "*/"; confirm.
f.write("""\
/* When recompiling tables with a new Unicode version, please check the types
in this structure definition with those in pcre2_internal.h (the actual field
names will be different).
\n""")

f.write(record_struct)
795
# Emit a fallback record used by the 32-bit library when it is run in
# non-32-bit mode: code points above 0x10ffff cannot be looked up in the
# two-stage tables, so they all map to this single "unknown" record.
f.write("""
/* If the 32-bit library is run in non-32-bit mode, character values greater
than 0x10ffff may be encountered. For these we set up a special record. */

#if PCRE2_CODE_UNIT_WIDTH == 32
const ucd_record PRIV(dummy_ucd_record)[] = {{
  ucp_Unknown,    /* script */
  ucp_Cn,         /* type unassigned */
  ucp_gbOther,    /* grapheme break property */
  0,              /* case set */
  0,              /* other case */
  0 | (ucp_bidiL << UCD_BIDICLASS_SHIFT), /* script extension and bidi class */
  0,              /* bool properties offset */
  }};
#endif
\n""")
812
# --- Output the table of caseless character sets ---

# The table starts with a NOTACHAR entry so that offset 0 can mean
# "no caseless set" in the per-character records.
f.write("""\
/* This table contains lists of characters that are caseless sets of
more than one character. Each list is terminated by NOTACHAR. */

const uint32_t PRIV(ucd_caseless_sets)[] = {
  NOTACHAR,
""")
822
# Write each caseless set in ascending code point order, each list being
# terminated by a NOTACHAR entry.
for cset in caseless_sets:
  for ch in sorted(cset):
    f.write('  0x%04x,' % ch)
  f.write('  NOTACHAR,\n')
f.write('};\n\n')
829
# --- Other tables are not needed by pcre2test ---

# Fix: the emitted comment previously read "... nor the / the large main UCD
# tables" (duplicated "the" across the line break).
f.write("""\
/* When #included in pcre2test, we don't need the table of digit sets, nor the
large main UCD tables. */

#ifndef PCRE2_PCRE2TEST
\n""")
838
# --- Read Scripts.txt again for the sets of 10 digits. ---

# Each "Nd" (decimal digit) range in Scripts.txt must contain a whole number
# of 10-digit runs; for each run we record the code point of its '9' (the
# last digit), which is what the ucd_digit_sets table stores.
digitsets = []
scripts_file = open('Unicode.tables/Scripts.txt', 'r', encoding='utf-8')

for line in scripts_file:
  m = re.match(r'([0-9a-fA-F]+)\.\.([0-9a-fA-F]+)\s+;\s+\S+\s+#\s+Nd\s+', line)
  if m is None:
    continue
  first = int(m.group(1),16)
  last  = int(m.group(2),16)
  if ((last - first + 1) % 10) != 0:
    # Fix: this diagnostic was passed to f.write() with a file= keyword,
    # which raises TypeError on a file object's write method. Use print()
    # so the message actually reaches stderr (and gets a newline).
    print("ERROR: %04x..%04x does not contain a multiple of 10 characters" % (first, last),
      file=sys.stderr)
  while first < last:
    digitsets.append(first + 9)
    first += 10
scripts_file.close()
digitsets.sort()
858
f.write("""\
/* This table lists the code points for the '9' characters in each set of
decimal digits. It is used to ensure that all the digits in a script run come
from the same set. */

const uint32_t PRIV(ucd_digit_sets)[] = {
""")

# The first value is the count of entries; the '9' code points follow,
# eight to a line.
f.write("  %d,  /* Number of subsequent values */" % len(digitsets))
for idx, d in enumerate(digitsets):
  if idx % 8 == 0:
    f.write("\n ")
  f.write(" 0x%05x," % d)
f.write("\n};\n\n")
876
# --- Output the Script Extension and Boolean-property bitset vectors ---

f.write("""\
/* This vector is a list of script bitsets for the Script Extension property.
The number of 32-bit words in each bitset is #defined in pcre2_ucp.h as
ucd_script_sets_item_size. */

const uint32_t PRIV(ucd_script_sets)[] = {
""")
write_bitsets(script_lists, script_list_item_size)

# Fix: the emitted comment previously said "32_bit" — corrected to "32-bit"
# to match the wording of the script-sets comment above.
f.write("""\
/* This vector is a list of bitsets for Boolean properties. The number of
32-bit words in each bitset is #defined as ucd_boolprop_sets_item_size in
pcre2_ucp.h. */

const uint32_t PRIV(ucd_boolprop_sets)[] = {
""")
write_bitsets(bool_props_lists, bool_props_list_item_size)
894
895
# Output the main UCD tables.

f.write("""\
/* These are the main two-stage UCD tables. The fields in each record are:
script (8 bits), character type (8 bits), grapheme break property (8 bits),
offset to multichar other cases or zero (8 bits), offset to other case or zero
(32 bits, signed), bidi class (5 bits) and script extension (11 bits) packed
into a 16-bit field, and offset in binary properties table (16 bits). */
\n""")

# write_records/write_table are helpers defined earlier in this script (not
# visible here). min_stage1/min_stage2/min_block_size were selected by the
# block-size minimization loop above the main code.
write_records(records, record_size)
write_table(min_stage1, 'PRIV(ucd_stage1)')
write_table(min_stage2, 'PRIV(ucd_stage2)', min_block_size)

# Emit a compile-time guard so pcre2_internal.h's UCD_BLOCK_SIZE cannot get
# out of step with the block size chosen by this script, then close the
# SUPPORT_UNICODE and PCRE2_PCRE2TEST conditionals opened in the heading.
f.write("#if UCD_BLOCK_SIZE != %d\n" % min_block_size)
f.write("""\
#error Please correct UCD_BLOCK_SIZE in pcre2_internal.h
#endif
#endif  /* SUPPORT_UNICODE */

#endif  /* PCRE2_PCRE2TEST */

/* End of pcre2_ucd.c */
""")
920
# Fix: "f.close" without parentheses is a bare attribute access and never
# actually closes (or flushes) the output file. Call the method.
f.close()
922
923# End
924