• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Copyright (C) 2014 IBM Corporation and Others. All Rights Reserved.
4#
5# @author Steven R. Loomis <srl@icu-project.org>
6#
7# This tool slims down an ICU data (.dat) file according to a config file.
8#
9# See: http://bugs.icu-project.org/trac/ticket/10922
10#
11# Usage:
12#  Use "-h" to get help options.
13
14from __future__ import print_function
15
16import io
17import json
18import optparse
19import os
20import re
21import shutil
22import sys
23
# Python 2 only: switch the interpreter-wide default encoding to utf-8.
try:
    reload(sys)
except NameError:
    # No builtin reload() here, i.e. Python 3, which already defaults to utf-8.
    pass
else:
    sys.setdefaultencoding("utf-8")

# Provide a `basestring` name on every interpreter so isinstance() checks
# against it work unchanged.
try:
    basestring  # Python 2: the builtin exists, keep it
except NameError:
    basestring = str  # Python 3: plain str takes its place
35
# Byte order of the machine running this script; default for --endian.
endian = sys.byteorder

parser = optparse.OptionParser(
    usage="usage: mkdir tmp ; %prog -D ~/Downloads/icudt53l.dat -T tmp -F trim_en.json -O icudt53l.dat")

# Every command-line option as (option strings, add_option keyword arguments),
# registered below in this order.  -D, -F, -T and -O are required; that is
# enforced after parsing, not by optparse.
_option_specs = [
    (("-P", "--tool-path"),
     dict(action="store", dest="toolpath",
          help="set the prefix directory for ICU tools")),
    (("-D", "--input-file"),
     dict(action="store", dest="datfile",
          help="input data file (icudt__.dat)")),
    (("-F", "--filter-file"),
     dict(action="store", dest="filterfile",
          help="filter file (JSON format)")),
    (("-T", "--tmp-dir"),
     dict(action="store", dest="tmpdir",
          help="working directory.")),
    (("--delete-tmp",),
     dict(action="count", dest="deltmpdir",
          help="delete working directory.", default=0)),
    (("-O", "--outfile"),
     dict(action="store", dest="outfile",
          help="outfile  (NOT a full path)")),
    (("-v", "--verbose"),
     dict(action="count", default=0)),
    (("-L", "--locales"),
     dict(action="store", dest="locales",
          help="sets the 'locales.only' variable", default=None)),
    (("-e", "--endian"),
     dict(action="store", dest="endian",
          help='endian, big, little or host, your default is "%s".' % endian,
          default=endian, metavar="endianness")),
]

for _option_strings, _option_settings in _option_specs:
    parser.add_option(*_option_strings, **_option_settings)

(options, args) = parser.parse_args()

# Dictionary view of the parsed options, for lookup by option name.
optVars = vars(options)
90
def _fail(message):
    """Print *message* and terminate the script with exit status 1."""
    print(message)
    sys.exit(1)


# These options have no default and must be given on the command line.
for _required in ("datfile", "filterfile", "tmpdir", "outfile"):
    if optVars[_required] is None:
        _fail("Missing required option: %s" % _required)

if options.verbose > 0:
    print("Options: " + str(options))

# With --delete-tmp an existing working directory is wiped first.
if os.path.isdir(options.tmpdir) and options.deltmpdir:
    if options.verbose > 1:
        print("Deleting tmp dir %s.." % (options.tmpdir))
    shutil.rmtree(options.tmpdir)

# The working directory must not exist at this point; it is created fresh.
if os.path.isdir(options.tmpdir):
    _fail("Please delete tmpdir %s before beginning." % options.tmpdir)
os.mkdir(options.tmpdir)

if options.endian not in ("big", "little", "host"):
    _fail("Unknown endianness: %s" % options.endian)

# "host" stands for the byte order of the current machine.
if options.endian == "host":
    options.endian = endian

if not os.path.isdir(options.tmpdir):
    _fail("Error, tmpdir not a directory: %s" % (options.tmpdir))

if not os.path.isfile(options.filterfile):
    _fail("Filterfile doesn't exist: %s" % (options.filterfile))

if not os.path.isfile(options.datfile):
    _fail("Datfile doesn't exist: %s" % (options.datfile))

if not options.datfile.endswith(".dat"):
    _fail("Datfile doesn't end with .dat: %s" % (options.datfile))

# The output package is built inside the working directory.
outfile = os.path.join(options.tmpdir, options.outfile)

if os.path.isfile(outfile):
    _fail("Error, output file does exist: %s" % (outfile))

if not options.outfile.endswith(".dat"):
    _fail("Outfile doesn't end with .dat: %s" % (options.outfile))

# Output file name without its ".dat" suffix.
dataname = options.outfile[:-4]
144
145
146## TODO: need to improve this. Quotes, etc.
def runcmd(tool, cmd, doContinue=False):
    """Run a command-line tool through the shell and return its status.

    tool       -- executable name; joined onto options.toolpath when that
                  option is set.
    cmd        -- argument string appended after the tool name.  It is handed
                  to os.system() as-is, with no quoting applied.
    doContinue -- when true, a non-zero status is returned to the caller
                  instead of terminating the script.

    Returns the value of os.system().  On a non-zero status without
    doContinue, prints the failed command and exits with status 1.
    """
    executable = os.path.join(options.toolpath, tool) if options.toolpath else tool
    cmd = executable + " " + cmd

    if options.verbose > 4:
        print("# " + cmd)

    rc = os.system(cmd)
    if rc == 0 or doContinue:
        return rc
    print("FAILED: %s" % cmd)
    sys.exit(1)
161
## STEP 0 - read in json config
with io.open(options.filterfile, encoding='utf-8') as filter_stream:
    config = json.load(filter_stream)

# -L/--locales overrides variables.locales.only, creating the enclosing
# dictionaries when the filter file does not define them.
if options.locales:
    locale_vars = config.setdefault("variables", {}).setdefault("locales", {})
    locale_vars["only"] = options.locales.split(',')

if options.verbose > 6:
    print(config)

# Echo the filter file's own description, if it has one.
if "comment" in config:
    print("%s: %s" % (options.filterfile, config["comment"]))
176
## STEP 1 - copy the data file, swapping endianness
## endian_letter is the first letter of the endianness: 'b' (big) or 'l' (little)
endian_letter = options.endian[0]

runcmd("icupkg", "-t%s %s %s" % (endian_letter, options.datfile, outfile))

## STEP 2 - get listing
listfile = os.path.join(options.tmpdir, "icudata.lst")
runcmd("icupkg", "-l %s > %s" % (outfile, listfile))

# Read the listing as bytes and decode it in one go; one item name per line.
with open(listfile, 'rb') as listing:
    raw_listing = listing.read()
items = []
for entry in raw_listing.decode("utf-8").splitlines():
    items.append(entry.strip())
# Same names as a set, for membership tests.
itemset = set(items)

if options.verbose > 1:
    print("input file: %d items" % len(items))
193
# list of all trees, keyed by tree name
trees = {}
RES_INDX = "res_index.res"

# remove - always remove these (empty when the config has no "remove" list)
remove = set(config.get("remove", ()))

# keep - always keep these (empty when the config has no "keep" list)
keep = set(config.get("keep", ()))
209
def queueForRemoval(tree):
    """Queue the items of one tree that the filter config does not keep.

    tree -- name of an entry in the global ``trees`` table.

    Trees not mentioned under config["trees"] are left untouched.  When the
    tree's config is a string it names an entry of config["variables"] and is
    replaced, in ``config`` itself, by that entry; a missing variable is fatal
    (exit status 1).  If the resulting config has an "only" list, every
    locale of the tree that is not in the list is added to the global
    ``remove`` set.  An empty "only" list on a tree with a non-empty prefix
    also queues the tree's pool.res, when the package contains it.
    """
    global remove
    if tree not in config.get("trees", {}):
        return
    mytree = trees[tree]
    if options.verbose > 0:
        print("* %s: %d items" % (tree, len(mytree["locs"])))
    # do variable substitution for this tree here
    if isinstance(config["trees"][tree], basestring):
        treeStr = config["trees"][tree]
        if options.verbose > 5:
            print(" Substituting $%s for tree %s" % (treeStr, tree))
        if treeStr not in config.get("variables", {}):
            print(" ERROR: no variable:  variables.%s for tree %s" % (treeStr, tree))
            sys.exit(1)
        config["trees"][tree] = config["variables"][treeStr]
    myconfig = config["trees"][tree]
    if options.verbose > 4:
        print(" Config: %s" % (myconfig))
    # Process this tree
    if len(myconfig) == 0 or len(mytree["locs"]) == 0:
        if options.verbose > 2:
            print(" No processing for %s - skipping" % (tree))
        return
    if "only" not in myconfig:
        # Without an "only" list there is nothing to filter against.
        # (The tree name was previously missing from this message.)
        print("tree %s - no ONLY" % tree)
        return
    only = set(myconfig["only"])
    if len(only) == 0 and mytree["treeprefix"] != "":
        # Nothing of this tree is kept, so its pool bundle can go as well.
        thePool = "%spool.res" % (mytree["treeprefix"])
        if thePool in itemset:
            if options.verbose > 0:
                print("Removing %s because tree %s is empty." % (thePool, tree))
            remove.add(thePool)
    for loc in mytree["locs"]:
        if loc not in only:
            # REMOVE loc
            toRemove = "%s%s%s" % (mytree["treeprefix"], loc, mytree["extension"])
            if options.verbose > 6:
                print("Queueing for removal: %s" % toRemove)
            remove.add(toRemove)
253
def addTreeByType(tree, mytree):
    """Register a tree selected by name prefix and extension, then filter it.

    tree   -- tree name; used as the key in the global ``trees`` table.
    mytree -- dict with "treeprefix" and "extension" entries.  It is stored
              in ``trees`` and gains a "locs" list holding, for every package
              item that starts with the prefix and ends with the extension,
              the part of the item name between the two.

    Finishes by calling queueForRemoval() for the tree.
    """
    if options.verbose > 1:
        print("(considering %s): %s" % (tree, mytree))
    trees[tree] = mytree
    prefix = mytree["treeprefix"]
    extension = mytree["extension"]
    # Cut off exactly the prefix and the extension.  Slicing to an explicit
    # end index (rather than a hard-coded -4) is correct for an extension of
    # any length, including an empty one.
    mytree["locs"] = [
        item[len(prefix):len(item) - len(extension)]
        for item in items
        if item.startswith(prefix) and item.endswith(extension)
    ]
    # now, process
    queueForRemoval(tree)
265
# Trees identified purely by item name: (tree name, name prefix, extension).
for _tree_name, _tree_prefix, _tree_extension in (
        ("converters", "", ".cnv"),
        ("stringprep", "", ".spp"),
        ("translit", "translit/", ".res"),
        ("brkfiles", "brkitr/", ".brk"),
        ("brkdict", "brkitr/", "dict"),
        ("confusables", "", ".cfu"),
):
    addTreeByType(_tree_name,
                  {"treeprefix": _tree_prefix, "extension": _tree_extension})
272
# Every res_index.res item in the package marks a resource tree.
for item in items:
    if not item.endswith(RES_INDX):
        continue
    # Whatever precedes "res_index.res" in the item name is the tree prefix.
    treeprefix = item[:item.rindex(RES_INDX)]
    # An empty prefix is the "ROOT" tree; otherwise the tree name is the
    # prefix without its final character.
    tree = treeprefix[:-1] if treeprefix != "" else "ROOT"
    if options.verbose > 6:
        print("procesing %s" % (tree))
    tree_info = {"extension": ".res", "treeprefix": treeprefix, "hasIndex": True}
    trees[tree] = tree_info
    # read in the resource list for the tree
    treelistfile = os.path.join(options.tmpdir, "%s.lst" % tree)
    runcmd("iculslocs", "-i %s -N %s -T %s -l > %s" % (outfile, dataname, tree, treelistfile))
    with io.open(treelistfile, 'r', encoding='utf-8') as tree_listing:
        tree_info["locs"] = [entry.strip() for entry in tree_listing.readlines()]
    if tree in config.get("trees", {}):
        queueForRemoval(tree)
    else:
        print(" Warning: filter file %s does not mention trees.%s - will be kept as-is" % (options.filterfile, tree))
295
def removeList(count=0):
    """Remove the queued items from the package, retrying on dependency errors.

    count -- number of the current attempt; more than 10 is fatal.

    Items in the global ``keep`` set are first dropped from the global
    ``remove`` set.  The remaining names are written to REMOVE.lst in the
    working directory and passed to ``icupkg -r``, with its stderr captured in
    REMOVE.err.  When icupkg fails, each stderr line must have the form
    "Item X depends on missing item Y"; every X is added to ``remove`` and the
    function calls itself with count+1.  Any other stderr line, or a failure
    that adds no new item, terminates the script with exit status 1.
    """
    # don't allow "keep" items to creep in here.
    global remove
    remove = remove - keep
    if count > 10:
        print("Giving up - %dth attempt at removal." % count)
        sys.exit(1)
    if options.verbose > 1:
        print("%d items to remove - try #%d" % (len(remove), count))
    if len(remove) == 0:
        return
    oldcount = len(remove)
    hackerrfile = os.path.join(options.tmpdir, "REMOVE.err")
    removefile = os.path.join(options.tmpdir, "REMOVE.lst")
    with open(removefile, 'wb') as fi:
        fi.write('\n'.join(remove).encode("utf-8") + b'\n')
    rc = runcmd("icupkg", "-r %s %s 2> %s" % (removefile, outfile, hackerrfile), True)
    if rc == 0:
        return
    if options.verbose > 5:
        print("## Damage control, trying to parse stderr from icupkg..")
    # Context manager so the handle is closed even if reading fails.
    with open(hackerrfile, 'rb') as fi:
        erritems = fi.readlines()
    #Item zone/zh_Hant_TW.res depends on missing item zone/zh_Hant.res
    pat = re.compile(br"^Item ([^ ]+) depends on missing item ([^ ]+).*")
    for errline in erritems:
        line = errline.strip()
        m = pat.match(line)
        if not m:
            # Decode before printing so Python 3 shows the text, not a
            # b'...' repr; undecodable bytes are replaced rather than fatal.
            print("ERROR: could not match errline: %s" % line.decode("utf-8", "replace"))
            sys.exit(1)
        toDelete = m.group(1).decode("utf-8")
        if options.verbose > 5:
            print("<< %s added to delete" % toDelete)
        remove.add(toDelete)
    if options.verbose > 5:
        print(" now %d items to remove" % len(remove))
    if oldcount == len(remove):
        # No progress was made, so another attempt would fail the same way.
        print(" ERROR: could not add any mor eitems to remove. Fail.")
        sys.exit(1)
    removeList(count + 1)
337
# fire it up
removeList(1)

# now, fixup res_index, one at a time
for tree in trees:
    tree_info = trees[tree]
    # skip trees that don't have res_index
    if "hasIndex" not in tree_info:
        continue
    tree_prefix = tree_info["treeprefix"]
    # Each tree's index is rebuilt in its own directory under the working
    # directory; a tree with an empty prefix uses the working directory itself.
    treebunddir = os.path.join(options.tmpdir, tree_prefix) if tree_prefix else options.tmpdir
    if not os.path.isdir(treebunddir):
        os.mkdir(treebunddir)
    treebundres = os.path.join(treebunddir, RES_INDX)
    # Same path with ".res" swapped for ".txt".
    treebundtxt = "%s.txt" % (treebundres[:-4])
    runcmd("iculslocs", "-i %s -N %s -T %s -b %s" % (outfile, dataname, tree, treebundtxt))
    runcmd("genrb", "-d %s -s %s res_index.txt" % (treebunddir, treebunddir))
    runcmd("icupkg", "-s %s -a %s%s %s" % (options.tmpdir, tree_prefix, RES_INDX, outfile))
356