1#!/usr/bin/python 2# 3# Copyright (C) 2014 IBM Corporation and Others. All Rights Reserved. 4# 5# @author Steven R. Loomis <srl@icu-project.org> 6# 7# This tool slims down an ICU data (.dat) file according to a config file. 8# 9# See: http://bugs.icu-project.org/trac/ticket/10922 10# 11# Usage: 12# Use "-h" to get help options. 13 14from __future__ import print_function 15 16import io 17import json 18import optparse 19import os 20import re 21import shutil 22import sys 23 24try: 25 # for utf-8 on Python 2 26 reload(sys) 27 sys.setdefaultencoding("utf-8") 28except NameError: 29 pass # Python 3 already defaults to utf-8 30 31try: 32 basestring # Python 2 33except NameError: 34 basestring = str # Python 3 35 36endian=sys.byteorder 37 38parser = optparse.OptionParser(usage="usage: mkdir tmp ; %prog -D ~/Downloads/icudt53l.dat -T tmp -F trim_en.json -O icudt53l.dat" ) 39 40parser.add_option("-P","--tool-path", 41 action="store", 42 dest="toolpath", 43 help="set the prefix directory for ICU tools") 44 45parser.add_option("-D","--input-file", 46 action="store", 47 dest="datfile", 48 help="input data file (icudt__.dat)", 49 ) # required 50 51parser.add_option("-F","--filter-file", 52 action="store", 53 dest="filterfile", 54 help="filter file (JSON format)", 55 ) # required 56 57parser.add_option("-T","--tmp-dir", 58 action="store", 59 dest="tmpdir", 60 help="working directory.", 61 ) # required 62 63parser.add_option("--delete-tmp", 64 action="count", 65 dest="deltmpdir", 66 help="delete working directory.", 67 default=0) 68 69parser.add_option("-O","--outfile", 70 action="store", 71 dest="outfile", 72 help="outfile (NOT a full path)", 73 ) # required 74 75parser.add_option("-v","--verbose", 76 action="count", 77 default=0) 78 79parser.add_option('-L',"--locales", 80 action="store", 81 dest="locales", 82 help="sets the 'locales.only' variable", 83 default=None) 84 85parser.add_option('-e', '--endian', action='store', dest='endian', help='endian, big, little or host, your 
default is "%s".' % endian, default=endian, metavar='endianness') 86 87(options, args) = parser.parse_args() 88 89optVars = vars(options) 90 91for opt in [ "datfile", "filterfile", "tmpdir", "outfile" ]: 92 if optVars[opt] is None: 93 print("Missing required option: %s" % opt) 94 sys.exit(1) 95 96if options.verbose>0: 97 print("Options: "+str(options)) 98 99if (os.path.isdir(options.tmpdir) and options.deltmpdir): 100 if options.verbose>1: 101 print("Deleting tmp dir %s.." % (options.tmpdir)) 102 shutil.rmtree(options.tmpdir) 103 104if not (os.path.isdir(options.tmpdir)): 105 os.mkdir(options.tmpdir) 106else: 107 print("Please delete tmpdir %s before beginning." % options.tmpdir) 108 sys.exit(1) 109 110if options.endian not in ("big","little","host"): 111 print("Unknown endianness: %s" % options.endian) 112 sys.exit(1) 113 114if options.endian == "host": 115 options.endian = endian 116 117if not os.path.isdir(options.tmpdir): 118 print("Error, tmpdir not a directory: %s" % (options.tmpdir)) 119 sys.exit(1) 120 121if not os.path.isfile(options.filterfile): 122 print("Filterfile doesn't exist: %s" % (options.filterfile)) 123 sys.exit(1) 124 125if not os.path.isfile(options.datfile): 126 print("Datfile doesn't exist: %s" % (options.datfile)) 127 sys.exit(1) 128 129if not options.datfile.endswith(".dat"): 130 print("Datfile doesn't end with .dat: %s" % (options.datfile)) 131 sys.exit(1) 132 133outfile = os.path.join(options.tmpdir, options.outfile) 134 135if os.path.isfile(outfile): 136 print("Error, output file does exist: %s" % (outfile)) 137 sys.exit(1) 138 139if not options.outfile.endswith(".dat"): 140 print("Outfile doesn't end with .dat: %s" % (options.outfile)) 141 sys.exit(1) 142 143dataname=options.outfile[0:-4] 144 145 146## TODO: need to improve this. Quotes, etc. 
def runcmd(tool, cmd, doContinue=False):
    """Run an ICU command-line tool through the shell.

    tool       -- executable name; prefixed with options.toolpath when set.
    cmd        -- argument string appended after the tool name. It is passed
                  to the shell as-is (callers rely on ">" / "2>" redirection),
                  so paths containing spaces or shell metacharacters are NOT
                  quoted.
    doContinue -- when true, a non-zero status is returned to the caller
                  instead of terminating the script.

    Returns the status value reported by os.system(). Exits the script with
    status 1 on failure unless doContinue is set.
    """
    if options.toolpath:
        cmd = os.path.join(options.toolpath, tool) + " " + cmd
    else:
        cmd = tool + " " + cmd

    if options.verbose > 4:
        print("# " + cmd)

    rc = os.system(cmd)
    if rc != 0 and not doContinue:
        print("FAILED: %s" % cmd)
        sys.exit(1)
    return rc


## STEP 0 - read in json config
with io.open(options.filterfile, encoding='utf-8') as fi:
    config = json.load(fi)

# -L on the command line overrides variables.locales.only from the config.
if options.locales:
    config["variables"] = config.get("variables", {})
    config["variables"]["locales"] = config["variables"].get("locales", {})
    config["variables"]["locales"]["only"] = options.locales.split(',')

if options.verbose > 6:
    print(config)

if "comment" in config:
    print("%s: %s" % (options.filterfile, config["comment"]))

## STEP 1 - copy the data file, swapping endianness
## The first letter of endian_letter will be 'b' or 'l' for big or little
endian_letter = options.endian[0]

runcmd("icupkg", "-t%s %s %s" % (endian_letter, options.datfile, outfile))

## STEP 2 - get listing
listfile = os.path.join(options.tmpdir, "icudata.lst")
runcmd("icupkg", "-l %s > %s" % (outfile, listfile))

with open(listfile, 'rb') as fi:
    items = [line.strip() for line in fi.read().decode("utf-8").splitlines()]
# Set view of the listing for fast membership tests.
itemset = set(items)

if options.verbose > 1:
    print("input file: %d items" % len(items))

# list of all trees
trees = {}
RES_INDX = "res_index.res"

# remove - always remove these
remove = set(config["remove"]) if "remove" in config else set()

# keep - always keep these
keep = set(config["keep"]) if "keep" in config else set()


def queueForRemoval(tree):
    """Queue the items of `tree` that the filter config excludes.

    Does nothing when the config has no "trees" entry for `tree`. A string
    entry is treated as the name of an entry in config["variables"] and is
    replaced by that variable's value. When the resulting tree config has an
    "only" list, every locale of the tree that is not in that list is added
    to the global `remove` set; an empty "only" list on a tree with a
    non-empty prefix also queues that tree's pool.res, if present.
    """
    if tree not in config.get("trees", {}):
        return
    mytree = trees[tree]
    if options.verbose > 0:
        print("* %s: %d items" % (tree, len(mytree["locs"])))
    # do variable substitution for this tree here
    if isinstance(config["trees"][tree], basestring):
        treeStr = config["trees"][tree]
        if options.verbose > 5:
            print(" Substituting $%s for tree %s" % (treeStr, tree))
        if treeStr not in config.get("variables", {}):
            print(" ERROR: no variable: variables.%s for tree %s" % (treeStr, tree))
            sys.exit(1)
        config["trees"][tree] = config["variables"][treeStr]
    myconfig = config["trees"][tree]
    if options.verbose > 4:
        print(" Config: %s" % (myconfig))
    # Process this tree
    if len(myconfig) == 0 or len(mytree["locs"]) == 0:
        if options.verbose > 2:
            print(" No processing for %s - skipping" % (tree))
        return

    only = None
    if "only" in myconfig:
        only = set(myconfig["only"])
        if len(only) == 0 and mytree["treeprefix"] != "":
            thePool = "%spool.res" % (mytree["treeprefix"])
            if thePool in itemset:
                if options.verbose > 0:
                    print("Removing %s because tree %s is empty." % (thePool, tree))
                remove.add(thePool)
    else:
        # The original message was printed without its format argument.
        print("tree %s - no ONLY" % tree)

    # Without an "only" list there is nothing to filter on.
    if only is None:
        return
    for loc in mytree["locs"]:
        if loc not in only:
            # REMOVE loc
            toRemove = "%s%s%s" % (mytree["treeprefix"], loc, mytree["extension"])
            if options.verbose > 6:
                print("Queueing for removal: %s" % toRemove)
            remove.add(toRemove)


def addTreeByType(tree, mytree):
    """Register a tree whose members are found by prefix and extension.

    `mytree` must carry "treeprefix" and "extension". Its "locs" list is
    filled with the middle part (prefix and extension stripped) of every
    listed item that matches both, then the tree is passed to
    queueForRemoval().
    """
    if options.verbose > 1:
        print("(considering %s): %s" % (tree, mytree))
    trees[tree] = mytree
    prefix = mytree["treeprefix"]
    extension = mytree["extension"]
    # Strip exactly the matched extension rather than a fixed 4 characters.
    mytree["locs"] = [
        item[len(prefix):len(item) - len(extension)]
        for item in items
        if item.startswith(prefix) and item.endswith(extension)
    ]
    # now, process
    queueForRemoval(tree)


addTreeByType("converters", {"treeprefix": "", "extension": ".cnv"})
addTreeByType("stringprep", {"treeprefix": "", "extension": ".spp"})
addTreeByType("translit", {"treeprefix": "translit/", "extension": ".res"})
addTreeByType("brkfiles", {"treeprefix": "brkitr/", "extension": ".brk"})
addTreeByType("brkdict", {"treeprefix": "brkitr/", "extension": "dict"})
addTreeByType("confusables", {"treeprefix": "", "extension": ".cfu"})

# Every res_index.res in the listing marks a resource tree; the part of the
# item name before it is the tree prefix ("" is reported as "ROOT").
for item in items:
    if not item.endswith(RES_INDX):
        continue
    treeprefix = item[0:item.rindex(RES_INDX)]
    if treeprefix == "":
        tree = "ROOT"
    else:
        tree = treeprefix[0:-1]  # drop the last character of the prefix
    if options.verbose > 6:
        print("procesing %s" % (tree))
    trees[tree] = {"extension": ".res", "treeprefix": treeprefix, "hasIndex": True}
    # read in the resource list for the tree
    treelistfile = os.path.join(options.tmpdir, "%s.lst" % tree)
    runcmd("iculslocs", "-i %s -N %s -T %s -l > %s" % (outfile, dataname, tree, treelistfile))
    with io.open(treelistfile, 'r', encoding='utf-8') as fi:
        trees[tree]["locs"] = [line.strip() for line in fi.readlines()]
    if tree not in config.get("trees", {}):
        print(" Warning: filter file %s does not mention trees.%s - will be kept as-is" % (options.filterfile, tree))
    else:
        queueForRemoval(tree)


def removeList(count=0):
    """Remove the queued items from the output file, retrying as needed.

    Writes the `remove` set (minus `keep`) to REMOVE.lst and hands it to
    icupkg. When icupkg fails, its stderr is parsed for
    "Item X depends on missing item Y" lines; each X is added to `remove`
    and the removal is retried with count + 1. Exits the script when more
    than 10 attempts were made, when a stderr line cannot be parsed, or when
    a failed attempt adds nothing new to remove.
    """
    # don't allow "keep" items to creep in here.
    global remove
    remove = remove - keep
    if count > 10:
        print("Giving up - %dth attempt at removal." % count)
        sys.exit(1)
    if options.verbose > 1:
        print("%d items to remove - try #%d" % (len(remove), count))
    if len(remove) == 0:
        return

    oldcount = len(remove)
    hackerrfile = os.path.join(options.tmpdir, "REMOVE.err")
    removefile = os.path.join(options.tmpdir, "REMOVE.lst")
    with open(removefile, 'wb') as fi:
        # Sorted so the list file is reproducible from run to run.
        fi.write('\n'.join(sorted(remove)).encode("utf-8") + b'\n')
    rc = runcmd("icupkg", "-r %s %s 2> %s" % (removefile, outfile, hackerrfile), True)
    if rc == 0:
        return

    if options.verbose > 5:
        print("## Damage control, trying to parse stderr from icupkg..")
    with open(hackerrfile, 'rb') as fi:
        erritems = fi.readlines()
    #Item zone/zh_Hant_TW.res depends on missing item zone/zh_Hant.res
    pat = re.compile(br"^Item ([^ ]+) depends on missing item ([^ ]+).*")
    for rawline in erritems:
        line = rawline.strip()
        m = pat.match(line)
        if m:
            toDelete = m.group(1).decode("utf-8")
            if options.verbose > 5:
                print("<< %s added to delete" % toDelete)
            remove.add(toDelete)
        else:
            print("ERROR: could not match errline: %s" % line)
            sys.exit(1)
    if options.verbose > 5:
        print(" now %d items to remove" % len(remove))
    if oldcount == len(remove):
        print(" ERROR: could not add any mor eitems to remove. Fail.")
        sys.exit(1)
    removeList(count + 1)


# fire it up
removeList(1)

# now, fixup res_index, one at a time
for tree in trees:
    # skip trees that don't have res_index
    if "hasIndex" not in trees[tree]:
        continue
    treebunddir = options.tmpdir
    if trees[tree]["treeprefix"]:
        treebunddir = os.path.join(treebunddir, trees[tree]["treeprefix"])
    if not os.path.isdir(treebunddir):
        os.mkdir(treebunddir)
    treebundres = os.path.join(treebunddir, RES_INDX)
    # Same path with ".res" replaced by ".txt".
    treebundtxt = "%s.txt" % (treebundres[0:-4])
    runcmd("iculslocs", "-i %s -N %s -T %s -b %s" % (outfile, dataname, tree, treebundtxt))
    runcmd("genrb", "-d %s -s %s res_index.txt" % (treebunddir, treebunddir))
    runcmd("icupkg", "-s %s -a %s%s %s" % (options.tmpdir, trees[tree]["treeprefix"], RES_INDX, outfile))