#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# Matches a data line (one that starts with a hex code point) that carries a
# trailing "#" comment. Group 1 is the data without the comment and without
# the spaces in front of the "#".
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")

# Matches a single code point followed by a semicolon at the start of a line.
# Lines that already use range syntax ("0000..001F;") do not match.
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
    """Writes one line for the code points [first..last] that share data."""
    if first == last:
        out_file.write("%04X%s\n" % (first, data))
    else:
        out_file.write("%04X..%04X%s\n" % (first, last, data))


def CopyAndStripWithOptionalMerge(s, t, do_merge):
    """Copies a file, strips data-line comments, and optionally merges lines.

    Comments behind data lines are removed; all other lines only lose their
    trailing whitespace. If do_merge is true, consecutive single-code point
    lines with adjacent code points and identical data are combined into one
    line with range syntax (first..last).

    Args:
      s: Path of the source file.
      t: Path of the destination file; it is overwritten.
      do_merge: Whether to merge adjacent lines into ranges.
    """
    in_file = open(s, "r")
    try:
        out_file = open(t, "w")
        try:
            first = -1  # First code point with first_data.
            last = -1  # Last code point with first_data; -1 if no open range.
            first_data = ""  # Common data for code points [first..last].
            for line in in_file:
                match = _strip_re.match(line)
                if match:
                    line = match.group(1)
                else:
                    line = line.rstrip()
                if not do_merge:
                    # Only strip, don't merge: just output the stripped line.
                    out_file.write(line)
                    out_file.write("\n")
                    continue
                match = _code_point_re.match(line)
                if match:
                    c = int(match.group(1), 16)
                    # Keep the data starting with the semicolon.
                    data = line[match.end() - 1:]
                else:
                    c = -1
                    data = ""
                if last >= 0 and (c != (last + 1) or data != first_data):
                    # This line does not continue the pending range:
                    # output the range and forget it.
                    _WriteRange(out_file, first, last, first_data)
                    first = -1
                    last = -1
                    first_data = ""
                if c < 0:
                    # No single code point on this line, output as is.
                    out_file.write(line)
                    out_file.write("\n")
                elif last < 0:
                    # Set as the first line in a possible range.
                    first = c
                    last = c
                    first_data = data
                else:
                    # Must be c == (last + 1) and data == first_data
                    # because of the previous conditions:
                    # continue with the current range.
                    last = c
            if do_merge and last >= 0:
                # Output the last range in the file.
                _WriteRange(out_file, first, last, first_data)
        finally:
            # Close (and thereby flush) the output even if copying failed.
            out_file.close()
    finally:
        in_file.close()


def CopyAndStrip(s, t):
    """Copies a file and removes comments behind data lines but not in others."""
    CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
    """Copies and strips a file and merges lines.

    Copies a file, removes comments, and
    merges lines with adjacent code point ranges and identical per-code point
    data lines into one line with range syntax.
    """
    CopyAndStripWithOptionalMerge(s, t, True)


# Maps each known (version-less) file name to the function that copies it.
# Each function is called with the source and destination file paths.
_unidata_files = {
    # Simply copy these files.
    "BidiMirroring.txt": shutil.copy,
    "BidiTest.txt": shutil.copy,
    "Blocks.txt": shutil.copy,
    "CaseFolding.txt": shutil.copy,
    "DerivedAge.txt": shutil.copy,
    "DerivedBidiClass.txt": shutil.copy,
    "DerivedJoiningGroup.txt": shutil.copy,
    "DerivedJoiningType.txt": shutil.copy,
    "DerivedNumericValues.txt": shutil.copy,
    "NameAliases.txt": shutil.copy,
    "NormalizationCorrections.txt": shutil.copy,
    "PropertyAliases.txt": shutil.copy,
    "PropertyValueAliases.txt": shutil.copy,
    "SpecialCasing.txt": shutil.copy,
    "UnicodeData.txt": shutil.copy,

    # Copy these files and remove comments behind data lines but not in others.
    "DerivedCoreProperties.txt": CopyAndStrip,
    "DerivedNormalizationProps.txt": CopyAndStrip,
    "GraphemeBreakProperty.txt": CopyAndStrip,
    "NormalizationTest.txt": CopyAndStrip,
    "PropList.txt": CopyAndStrip,
    "Scripts.txt": CopyAndStrip,
    "SentenceBreakProperty.txt": CopyAndStrip,
    "WordBreakProperty.txt": CopyAndStrip,

    # Also merge lines with adjacent code point ranges.
    "EastAsianWidth.txt": CopyAndStripAndMerge,
    "LineBreak.txt": CopyAndStripAndMerge
}

# Matches a file name with a version suffix, for example
# "LineBreak-5.2.0d12.txt". Group 1 is the base name and group 2 is the
# extension, so that group 1 + group 2 is the version-less file name.
# Version components may have more than one digit (for example "-13.0.0").
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)" +
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?" +
                              r"(\.[a-z]+)$")


def main(argv=None):
    """Copies the known UCD files from the source tree to the destination.

    Walks the source folder recursively, removes version suffixes from the
    file names, prints each resulting name, and processes every file listed
    in _unidata_files with its associated copy function. Exits with status 1
    if two source files map to the same known file name, and with status 2
    if the folder arguments are missing.

    Args:
      argv: Argument list in the form of sys.argv (program name, source
          folder, destination folder). Defaults to sys.argv.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 3:
        sys.stderr.write("Usage: ucdcopy.py source_folder dest_folder\n")
        sys.exit(2)
    source_root = argv[1]
    dest_root = argv[2]
    # Collect all paths before writing anything, so that the walk never sees
    # files created by this run (for example if the folders overlap).
    source_files = []
    for root, dirs, files in os.walk(source_root):
        for name in files:
            source_files.append(os.path.join(root, name))
    files_processed = set()
    for source_file in source_files:
        basename = os.path.basename(source_file)
        match = _file_version_re.match(basename)
        if match:
            # Remove the version suffix: "Name-5.2.0.txt" -> "Name.txt".
            basename = match.group(1) + match.group(2)
        print(basename)
        if basename in _unidata_files:
            if basename in files_processed:
                print("duplicate file basename %s!" % basename)
                sys.exit(1)
            files_processed.add(basename)
            dest_file = os.path.join(dest_root, basename)
            _unidata_files[basename](source_file, dest_file)


if __name__ == "__main__":
    main()