• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
#!/usr/bin/python2.4 -B
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2009-2010 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.
24
25import os
26import os.path
27import re
28import shutil
29import sys
30
# Matches a line that starts with a hexadecimal code point and has a trailing
# "#" comment. group(1) is the line content before the comment, without the
# spaces that precede the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")

# Matches the leading single-code point field of a line, up to and including
# the ";" that terminates the field. Lines that start with a "first..last"
# range do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
  """Writes one line for the code points [first..last] that share data."""
  if first == last:
    out_file.write("%04X%s\n" % (first, data))
  else:
    out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies file s to file t, stripping comments behind data lines.

  A line that starts with a hexadecimal code point has its trailing "#"
  comment (and the spaces before it) removed. Every other line is copied
  with its trailing whitespace removed.

  If do_merge is true, consecutive lines for single code points
  c, c+1, ... with identical data are merged into one line that uses the
  "first..last" range syntax. Merged and unmerged single-code point lines
  are rewritten as the code point in at least four uppercase hex digits,
  directly followed by the ";" and the rest of the line.

  Args:
    s: Path of the source file.
    t: Path of the destination file; it is created or overwritten.
    do_merge: Whether to merge lines for adjacent code points.
  """
  # try/finally rather than "with" so that both files are closed on errors
  # without requiring a newer Python than the rest of this script.
  in_file = open(s, "r")
  try:
    out_file = open(t, "w")
    try:
      first = -1  # First code point with first_data.
      last = -1  # Last code point with first_data; -1 if no pending range.
      first_data = ""  # Common data for code points [first..last].
      for line in in_file:
        match = _strip_re.match(line)
        if match:
          line = match.group(1)
        else:
          line = line.rstrip()
        if not do_merge:
          # Only strip, don't merge: just output the stripped line.
          out_file.write(line + "\n")
          continue
        match = _code_point_re.match(line)
        if match:
          c = int(match.group(1), 16)
          # Keep the ";" so that the data can be appended to the code point.
          data = line[match.end() - 1:]
        else:
          c = -1
          data = ""
        if last >= 0 and (c != last + 1 or data != first_data):
          # This line does not continue the pending range: output the range.
          _WriteRange(out_file, first, last, first_data)
          last = -1
        if c < 0:
          # No single code point on this line, output as is.
          out_file.write(line + "\n")
        elif last < 0:
          # Start a new possible range with this line.
          first = c
          last = c
          first_data = data
        else:
          # Here c == last + 1 and data == first_data: extend the range.
          last = c
      if do_merge and last >= 0:
        # Output the last range in the file.
        _WriteRange(out_file, first, last, first_data)
    finally:
      out_file.close()
  finally:
    in_file.close()
95
96
def CopyAndStrip(s, t):
  """Copies file s to file t without merging lines.

  Comments behind data lines are removed; other lines keep their comments.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=False)
100
101
def CopyAndStripAndMerge(s, t):
  """Copies file s to file t, stripping comments and merging lines.

  Comments behind data lines are removed, and lines for adjacent code
  points that carry identical per-code point data are combined into a
  single line that uses range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, do_merge=True)
110
111
# Maps each known file name to the action used to copy it. A value is either
# a function(source_path, dest_path), or a tuple of such a function and the
# name of a non-default destination folder.

# Simply copy these files.
_files = dict.fromkeys([
  "BidiMirroring.txt",
  "Blocks.txt",
  "CaseFolding.txt",
  "DerivedAge.txt",
  "DerivedBidiClass.txt",
  "DerivedJoiningGroup.txt",
  "DerivedJoiningType.txt",
  "DerivedNumericValues.txt",
  "NameAliases.txt",
  "NormalizationCorrections.txt",
  "PropertyAliases.txt",
  "PropertyValueAliases.txt",
  "ScriptExtensions.txt",
  "SpecialCasing.txt",
  "UnicodeData.txt",
], shutil.copy)

# Simply copy these files as well, but into the "testdata" folder.
_files.update(dict.fromkeys([
  "BidiTest.txt",
  "GraphemeBreakTest.txt",
  "LineBreakTest.txt",
  "SentenceBreakTest.txt",
  "WordBreakTest.txt",
], (shutil.copy, "testdata")))

# Copy these files and remove comments behind data lines but not in others.
_files.update(dict.fromkeys([
  "DerivedCoreProperties.txt",
  "DerivedNormalizationProps.txt",
  "GraphemeBreakProperty.txt",
  "NormalizationTest.txt",
  "PropList.txt",
  "Scripts.txt",
  "SentenceBreakProperty.txt",
  "WordBreakProperty.txt",
], CopyAndStrip))

# Also merge lines with adjacent code point ranges.
_files.update(dict.fromkeys([
  "EastAsianWidth.txt",
  "LineBreak.txt",
], CopyAndStripAndMerge))
149
# Matches a file name that carries a version suffix, such as
# "LineBreakTest-5.2.0d3.txt" or "UnicodeData-10.0.0.txt".
# group(1) is the name before the "-version" part and group(2) is the file
# extension including its dot, so group(1) + group(2) is the unversioned name.
# Each numeric version component may have more than one digit.
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")
153
def main():
  """Copies the known UCD files from a source tree to a destination tree.

  Takes the source root folder from sys.argv[1] and the destination root
  folder from sys.argv[2]. Walks the source tree, removes a version suffix
  from each file name that has one (printing the resulting name), and
  applies the action registered in _files to every file whose name is
  listed there. Files go into the "unidata" subfolder of the destination
  root unless the _files entry names another folder; destination folders
  are created as needed.

  Exits with status 1 if the two arguments are missing, or if two source
  files map to the same known file name.
  """
  if len(sys.argv) < 3:
    sys.stderr.write("usage: %s source_root dest_root\n" % sys.argv[0])
    sys.exit(1)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  # Collect all source paths before anything is copied.
  source_files = []
  for root, dirs, files in os.walk(source_root):
    for name in files:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Drop the "-version" part of the file name.
      basename = match.group(1) + match.group(2)
      print(basename)
    if basename not in _files:
      continue
    if basename in files_processed:
      print("duplicate file basename %s!" % basename)
      sys.exit(1)
    files_processed.add(basename)
    action = _files[basename]
    if isinstance(action, tuple):
      # (function, destination folder name)
      dest_folder = action[1]
      action = action[0]
    else:
      dest_folder = "unidata"
    dest_path = os.path.join(dest_root, dest_folder)
    if not os.path.exists(dest_path):
      os.makedirs(dest_path)
    dest_file = os.path.join(dest_path, basename)
    action(source_file, dest_file)


if __name__ == "__main__":
  main()
187