#!/usr/bin/python2.4
# Copyright (c) 2009 International Business Machines
# Corporation and others. All Rights Reserved.
#
#   file name:  ucdcopy.py
#   encoding:   US-ASCII
#   tab size:   8 (not used)
#   indentation:4
#
#   created on: 2009aug04
#   created by: Markus W. Scherer
#
# Copy Unicode Character Database (ucd) files from a tree
# of files downloaded from ftp://www.unicode.org/Public/5.2.0/
# to a folder like ICU's source/data/unidata/
# and modify some of the files to make them more compact.
#
# Invoke with two command-line parameters, for the source
# and destination folders.

import os
import os.path
import re
import shutil
import sys

# Matches a line that starts with hex digits (a data line) and has a trailing
# "#" comment; group 1 is the data without the comment and the spaces
# directly in front of it.
_strip_re = re.compile(r"^([0-9a-fA-F]+.+?) *#.*")

# Matches a single code point (not a first..last range) followed by ";".
_code_point_re = re.compile(r"\s*([0-9a-fA-F]+)\s*;")


def _WriteRange(out_file, first, last, data):
  """Writes one code point, or a first..last range, followed by data."""
  if first == last:
    out_file.write("%04X%s\n" % (first, data))
  else:
    out_file.write("%04X..%04X%s\n" % (first, last, data))


def _StripAndMergeLines(in_file, out_file, do_merge):
  """Copies lines from in_file to out_file, stripping and optionally merging.

  Args:
    in_file: Iterable of input lines (an open text file).
    out_file: Object with a write() method that receives the output.
    do_merge: If true, consecutive single-code point lines with identical
        data are combined into one line with first..last range syntax.
  """
  first = -1  # First code point with first_data.
  last = -1  # Last code point with first_data.
  first_data = ""  # Common data for code points [first..last].
  for line in in_file:
    match = _strip_re.match(line)
    if match:
      line = match.group(1)
    else:
      line = line.rstrip()
    if not do_merge:
      # Only strip, don't merge: just output the stripped line.
      out_file.write(line)
      out_file.write("\n")
      continue
    match = _code_point_re.match(line)
    if match:
      c = int(match.group(1), 16)
      # Keep everything from the ";" on, so that equal data compares equal
      # regardless of the code point in front of it.
      data = line[match.end() - 1:]
    else:
      c = -1
      data = ""
    if last >= 0 and (c != (last + 1) or data != first_data):
      # This line does not continue the pending range: output the range.
      _WriteRange(out_file, first, last, first_data)
      first = -1
      last = -1
      first_data = ""
    if c < 0:
      # No single-code point data on this line, output as is.
      out_file.write(line)
      out_file.write("\n")
    elif last < 0:
      # Set as the first line in a possible range.
      first = c
      last = c
      first_data = data
    else:
      # Must be c == (last + 1) and data == first_data
      # because of the previous conditions: continue the current range.
      last = c
  if do_merge and last >= 0:
    # Output the last range in the file.
    _WriteRange(out_file, first, last, first_data)


def CopyAndStripWithOptionalMerge(s, t, do_merge):
  """Copies file s to file t, stripping comments behind data lines.

  Lines that do not start with hex digits are copied with only trailing
  whitespace removed. Both files are closed even if copying fails.

  Args:
    s: Path of the source file.
    t: Path of the destination file; it is created or overwritten.
    do_merge: If true, also merge lines for adjacent code points that have
        identical data into one line with range syntax.

  Raises:
    IOError: If a file cannot be opened, read or written.
  """
  in_file = open(s, "r")
  # try/finally rather than "with": the latter is not available in the
  # Python version named in the shebang line.
  try:
    out_file = open(t, "w")
    try:
      _StripAndMergeLines(in_file, out_file, do_merge)
    finally:
      out_file.close()
  finally:
    in_file.close()


def CopyAndStrip(s, t):
  """Copies a file and removes comments behind data lines but not in others."""
  CopyAndStripWithOptionalMerge(s, t, False)


def CopyAndStripAndMerge(s, t):
  """Copies and strips a file and merges lines.

  Copies a file, removes comments, and
  merges lines with adjacent code point ranges and identical per-code point
  data lines into one line with range syntax.
  """
  CopyAndStripWithOptionalMerge(s, t, True)


# Maps the version-less basename of each file to copy
# to the function(source_path, dest_path) that copies it.
_unidata_files = {
  # Simply copy these files.
  "BidiMirroring.txt": shutil.copy,
  "BidiTest.txt": shutil.copy,
  "Blocks.txt": shutil.copy,
  "CaseFolding.txt": shutil.copy,
  "DerivedAge.txt": shutil.copy,
  "DerivedBidiClass.txt": shutil.copy,
  "DerivedJoiningGroup.txt": shutil.copy,
  "DerivedJoiningType.txt": shutil.copy,
  "DerivedNumericValues.txt": shutil.copy,
  "NameAliases.txt": shutil.copy,
  "NormalizationCorrections.txt": shutil.copy,
  "PropertyAliases.txt": shutil.copy,
  "PropertyValueAliases.txt": shutil.copy,
  "SpecialCasing.txt": shutil.copy,
  "UnicodeData.txt": shutil.copy,

  # Copy these files and remove comments behind data lines but not in others.
  "DerivedCoreProperties.txt": CopyAndStrip,
  "DerivedNormalizationProps.txt": CopyAndStrip,
  "GraphemeBreakProperty.txt": CopyAndStrip,
  "NormalizationTest.txt": CopyAndStrip,
  "PropList.txt": CopyAndStrip,
  "Scripts.txt": CopyAndStrip,
  "SentenceBreakProperty.txt": CopyAndStrip,
  "WordBreakProperty.txt": CopyAndStrip,

  # Also merge lines with adjacent code point ranges.
  "EastAsianWidth.txt": CopyAndStripAndMerge,
  "LineBreak.txt": CopyAndStripAndMerge
}

# Matches a file name with a version suffix, such as Name-5.2.0.txt or
# Name-5.2.0d3.txt; group 1 is the name and group 2 is the extension.
# Version components may have several digits (for example 10.0.0).
_file_version_re = re.compile(r"^([a-zA-Z0-9]+)" +
                              r"-[0-9]+(?:\.[0-9]+)*(?:d[0-9]+)?" +
                              r"(\.[a-z]+)$")


def main():
  """Copies the known ucd files from the source tree to the destination.

  Reads the source and destination folders from sys.argv[1] and sys.argv[2],
  walks the source tree, prints the version-less basename of every file
  found, and copies each file listed in _unidata_files with its function.
  Exits with status 1 if arguments are missing or if two source files map
  to the same basename.
  """
  if len(sys.argv) < 3:
    sys.stderr.write("usage: %s source_root dest_root\n" % sys.argv[0])
    sys.exit(1)
  source_root = sys.argv[1]
  dest_root = sys.argv[2]
  source_files = []
  for root, dirs, files in os.walk(source_root):
    for name in files:
      source_files.append(os.path.join(root, name))
  files_processed = set()
  for source_file in source_files:
    basename = os.path.basename(source_file)
    match = _file_version_re.match(basename)
    if match:
      # Remove the version suffix from the file name.
      basename = match.group(1) + match.group(2)
    # Single-argument print(...) behaves the same as a print statement
    # and as the print function.
    print(basename)
    if basename in _unidata_files:
      if basename in files_processed:
        print("duplicate file basename %s!" % basename)
        sys.exit(1)
      files_processed.add(basename)
      dest_file = os.path.join(dest_root, basename)
      _unidata_files[basename](source_file, dest_file)


if __name__ == "__main__":
  main()