#!/usr/bin/python2.4 -B
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (c) 2009-2010 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# A data line: starts with a hex code point, followed by fields, followed by
# a trailing "# comment". Group 1 is the line without the comment and without
# the spaces in front of it.
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")

# A line for a single code point (not a first..last range): hex digits
# followed by a semicolon.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
    """Writes one code point, or a first..last range, followed by its data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies file s to file t, stripping comments behind data lines.

    Lines that are not data lines (for example pure comment lines) are copied
    with only their trailing whitespace removed.

    Args:
      s: Path of the source file.
      t: Path of the destination file; it is overwritten.
      do_merge: If true, consecutive single-code point lines with adjacent
          code points and identical data are merged into one line
          with first..last range syntax.
    """
    # try/finally rather than "with": the shebang line asks for Python 2.4,
    # which does not have the with statement.
    in_file = open(s, "r")
    try:
        out_file = open(t, "w")
        try:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if not do_merge:
                    # Only strip, don't merge: just output the stripped line.
                    out_file.write(line)
                    out_file.write("\n")
                    continue
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    # Keep the semicolon as the start of the data.
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # This line does not continue the pending range:
                    # output the current range.
                    _WriteRange(out_file, first, last, first_data)
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # No data on this line, output as is.
                    out_file.write(line)
                    out_file.write("\n")
                elif last < 0:
                    # Set as the first line in a possible range.
                    first = c
                    last = c
                    first_data = data
                else:
                    # Must be c == (last + 1) and data == first_data
                    # because of previous conditions:
                    # continue with the current range.
                    last = c
            if do_merge and last >= 0:
                # Output the last range in the file.
                _WriteRange(out_file, first, last, first_data)
        finally:
            out_file.close()
    finally:
        in_file.close()


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others."""
    CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.
    """
    CopyAndStripWithOptionalMerge(s, t, True)


# Maps a file name (without version suffix) to the action for that file.
# A value is either a function(source_path, dest_path), which writes into the
# "unidata" destination subfolder, or a tuple (function, dest_subfolder).
_files = {
    # Simply copy these files.
    "BidiMirroring.txt": shutil.copy,
    "BidiTest.txt": (shutil.copy, "testdata"),
    "Blocks.txt": shutil.copy,
    "CaseFolding.txt": shutil.copy,
    "DerivedAge.txt": shutil.copy,
    "DerivedBidiClass.txt": shutil.copy,
    "DerivedJoiningGroup.txt": shutil.copy,
    "DerivedJoiningType.txt": shutil.copy,
    "DerivedNumericValues.txt": shutil.copy,
    "GraphemeBreakTest.txt": (shutil.copy, "testdata"),
    "LineBreakTest.txt": (shutil.copy, "testdata"),
    "NameAliases.txt": shutil.copy,
    "NormalizationCorrections.txt": shutil.copy,
    "PropertyAliases.txt": shutil.copy,
    "PropertyValueAliases.txt": shutil.copy,
    "SentenceBreakTest.txt": (shutil.copy, "testdata"),
    "ScriptExtensions.txt": shutil.copy,
    "SpecialCasing.txt": shutil.copy,
    "UnicodeData.txt": shutil.copy,
    "WordBreakTest.txt": (shutil.copy, "testdata"),

    # Copy these files and remove comments behind data lines but not in others.
    "DerivedCoreProperties.txt": CopyAndStrip,
    "DerivedNormalizationProps.txt": CopyAndStrip,
    "GraphemeBreakProperty.txt": CopyAndStrip,
    "NormalizationTest.txt": CopyAndStrip,
    "PropList.txt": CopyAndStrip,
    "Scripts.txt": CopyAndStrip,
    "SentenceBreakProperty.txt": CopyAndStrip,
    "WordBreakProperty.txt": CopyAndStrip,

    # Also merge lines with adjacent code point ranges.
    "EastAsianWidth.txt": CopyAndStripAndMerge,
    "LineBreak.txt": CopyAndStripAndMerge,
}

# Matches a versioned file name such as "UnicodeData-5.2.0d5.txt":
# group 1 is the name before the hyphen, group 2 is the file extension.
# Each version component may have several digits (e.g. "-10.0.0").
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)"
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?"
                              r"(\.[a-z]+)$")


def main(argv=None):
    """Copies the known UCD files from the source tree to the destination.

    Args:
      argv: Argument list in sys.argv layout: argv[1] is the source root
          folder and argv[2] is the destination root folder.
          Defaults to sys.argv.

    Exits with status 1 if an argument is missing, or if two files in the
    source tree map to the same known basename.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write(
            "usage: ucdcopy.py source_folder destination_folder\n")
        sys.exit(1)
    source_root = argv[1]
    dest_root = argv[2]
    # Collect all source paths before writing anything.
    source_files = []
    for root, dirs, files in os.walk(source_root):
        for name in files:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            # Remove the version suffix from the file name.
            basename = match.group(1) + match.group(2)
            # Single-argument print(...) gives the same output
            # as a statement or as a function call.
            print(basename)
        if basename not in _files:
            continue
        if basename in files_processed:
            print("duplicate file basename %s!" % basename)
            sys.exit(1)
        files_processed.add(basename)
        action = _files[basename]
        if isinstance(action, tuple):
            dest_folder = action[1]
            action = action[0]
        else:
            dest_folder = "unidata"
        dest_path = os.path.join(dest_root, dest_folder)
        if not os.path.exists(dest_path):
            os.makedirs(dest_path)
        dest_file = os.path.join(dest_path, basename)
        action(source_file, dest_file)


if __name__ == "__main__":
    main()