• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#! /usr/bin/python
2
3#                   PCRE2 UNICODE PROPERTY SUPPORT
4#                   ------------------------------
5#
# This file auto-generates Unicode property tests and their expected output.
# It is recommended to re-run this generator after the Unicode files are
# updated. The names of the generated files are `testinput26` and `testoutput26`.
9
10import re
11import sys
12
13from GenerateCommon import \
14  script_names, \
15  script_abbrevs
16
def write_both(text):
  """Write text to both generated files (test input and expected output)."""
  for generated_file in (input_file, output_file):
    generated_file.write(text)
20
def to_string_char(ch_idx):
  """Return the text used for code point ch_idx in the generated test files.

  Printable ASCII (32..126) is returned as the character itself. Every other
  code point, including DEL (127), is returned as a brace-delimited hex
  escape with at least two lower-case hex digits.
  """
  # Only 32..126 is printable ASCII; 127 (DEL) is a control character and
  # must be escaped like the other non-printing values.
  if 32 <= ch_idx < 127:
    return chr(ch_idx)
  # %02x zero-pads values below 16 (e.g. \x{0a}) and leaves larger values
  # unchanged (e.g. \x{1f}, \x{10ffff}).
  return "\\x{%02x}" % ch_idx
28
# Prefix for the generated file names; empty means the current directory.
output_directory = ""

# At most one command-line argument is accepted: the output directory.
if len(sys.argv) == 2:
  output_directory = sys.argv[1]
  # The file names are appended directly, so ensure a trailing separator.
  if output_directory[-1:] != "/":
    output_directory += "/"
elif len(sys.argv) > 2:
  print('** Too many arguments: just give a directory name')
  sys.exit(1)

# Both files are opened at module level and used by the functions in this file.
try:
  input_file = open(output_directory + "testinput26", "w")
  output_file = open(output_directory + "testoutput26", "w")
except IOError:
  print("** Couldn't open output files")
  sys.exit(1)
45
# Both generated files start with the same "do not edit" banner.
write_both("# These tests are generated by maint/GenerateTest26.py, do not edit.\n\n")

# ---------------------------------------------------------------------------
#                      UNICODE SCRIPT EXTENSION TESTS
# ---------------------------------------------------------------------------

# Section heading, written to both files before the script tests.
write_both("# Unicode Script Extension tests.\n\n")
53
def _read_property_ranges(path, property_re):
  """Yield (low, high, value) for each line of path matched by property_re.

  Lines that do not match are skipped. low and high are the inclusive code
  point bounds parsed from hexadecimal; high equals low when the line names
  a single code point. value is the text of the third regex group.
  """
  with open(path) as f:
    for line in f:
      match_obj = property_re.match(line)
      if match_obj is None:
        continue

      low = int(match_obj.group(1), 16)
      high = low
      if match_obj.group(2) is not None:
        high = int(match_obj.group(2), 16)

      yield low, high, match_obj.group(3)

def _load_base_scripts(property_re):
  """Read Scripts.txt and return (script_data, char_data).

  script_data has one entry per script_names index: None if the script never
  appears, otherwise a five-item list whose first two items are the low bound
  of the script's first range and the high bound of its last range in the
  file. The remaining three items start as None. char_data maps every code
  point to its script name, or None when the file does not list it.
  """
  script_data = [None] * len(script_names)
  char_data = [None] * 0x110000
  prev_name = ""
  script_idx = -1

  for low, high, name in _read_property_ranges("Unicode.tables/Scripts.txt", property_re):
    # Consecutive lines usually name the same script; skip the list search.
    if name != prev_name:
      script_idx = script_names.index(name)
      prev_name = name

    char_data[low] = name
    for code_point in range(low + 1, high + 1):
      char_data[code_point] = name

    if script_data[script_idx] is None:
      script_data[script_idx] = [low, None, None, None, None]
    script_data[script_idx][1] = high

  return script_data, char_data

def _load_script_extensions(property_re, script_data, char_data):
  """Read ScriptExtensions.txt and fill items 2..4 of the script records.

  For every script abbreviation listed on a line, item 2 becomes the lowest
  and item 3 the highest code point seen for that script in this file.
  Item 4 becomes the first code point found in one of those ranges whose
  base script (from char_data) is a different script; it stays None if no
  such code point is found.
  """
  extended_script_indices = {}

  for low, high, abbrevs in _read_property_ranges("Unicode.tables/ScriptExtensions.txt", property_re):
    for abbrev in abbrevs.split(" "):
      if abbrev not in extended_script_indices:
        script_idx = script_abbrevs.index(abbrev)
        extended_script_indices[abbrev] = script_idx
        rec = script_data[script_idx]
        rec[2] = low
        rec[3] = high
      else:
        script_idx = extended_script_indices[abbrev]
        rec = script_data[script_idx]
        if rec[2] > low:
          rec[2] = low
        if rec[3] < high:
          rec[3] = high

      if rec[4] is None:
        name = script_names[script_idx]
        for code_point in range(low, high + 1):
          if char_data[code_point] != name:
            rec[4] = code_point
            break

def _write_property_test(prop, code_point, matches=True):
  """Write one anchored property test for the character code_point.

  The pattern and subject lines go to both files. The expected result goes
  to the output file only: the matched text when matches is true, otherwise
  "No match". A blank line written to both files ends the test.
  """
  subject = to_string_char(code_point)
  write_both("/^\\p{%s}/utf\n" % prop)
  write_both("  %s\n" % subject)
  if matches:
    output_file.write(" 0: %s\n" % subject)
  else:
    output_file.write("No match\n")
  write_both("\n")

def gen_script_tests():
  """Generate the script and script extension tests for every known script.

  Reads Unicode.tables/Scripts.txt and Unicode.tables/ScriptExtensions.txt
  (relative to the current directory) and, for each script except "Unknown",
  writes tests to the generated files:
    - base script checks using the first and last code point of the script;
    - for scripts listed in ScriptExtensions.txt, checks using the lowest
      and highest extension code point, alternating between the "scx" and
      "Script_Extensions" property names;
    - when available, a code point that belongs to the script only through
      its extensions, which must match the plain script name but not sc=;
    - a code point one past the highest known one, which must not match.
  """
  property_re = re.compile("^([0-9A-F]{4,6})(?:\\.\\.([0-9A-F]{4,6}))? +; ([A-Za-z_ ]+) #")

  script_data, char_data = _load_base_scripts(property_re)
  _load_script_extensions(property_re, script_data, char_data)

  # Toggled after each script that has extensions, so both spellings of the
  # property name get exercised.
  long_property_name = False

  for idx, rec in enumerate(script_data):
    script_name = script_names[idx]

    if script_name == "Unknown":
      continue

    script_abbrev = script_abbrevs[idx]

    write_both("# Base script check\n")
    _write_property_test("sc=%s" % script_name, rec[0])
    _write_property_test("Script=%s" % script_abbrev, rec[1])

    if rec[2] is not None:
      property_name = "scx"
      if long_property_name:
        property_name = "Script_Extensions"

      write_both("# Script extension check\n")
      _write_property_test(script_name, rec[2])
      _write_property_test("%s=%s" % (property_name, script_abbrev), rec[3])

      long_property_name = not long_property_name

      if rec[4] is not None:
        write_both("# Script extension only character\n")
        _write_property_test(script_name, rec[4])
        _write_property_test("sc=%s" % script_name, rec[4], matches=False)
      else:
        print("External character has not found for %s" % script_name)

    # One past the highest code point seen for the script in either file.
    high = rec[1]
    if rec[3] is not None and rec[3] > rec[1]:
      high = rec[3]
    write_both("# Character not in script\n")
    _write_property_test(script_name, high + 1, matches=False)
184
185
# Generate all tests and finish both files with the end marker. The files
# were opened at module level, so close them explicitly (even if generation
# fails) instead of relying on interpreter shutdown to flush them.
try:
  gen_script_tests()

  write_both("# End of testinput26\n")
finally:
  input_file.close()
  output_file.close()
189