• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2009 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.tools.dict;
18 
19 import org.xml.sax.Attributes;
20 import org.xml.sax.helpers.DefaultHandler;
21 
22 import java.io.BufferedReader;
23 import java.io.BufferedWriter;
24 import java.io.File;
25 import java.io.FileInputStream;
26 import java.io.FileOutputStream;
27 import java.io.FileWriter;
28 import java.io.IOException;
29 import java.io.InputStreamReader;
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Map;
34 
35 import javax.xml.parsers.SAXParser;
36 import javax.xml.parsers.SAXParserFactory;
37 
38 /**
39  * Compresses a list of words and frequencies into a tree structured binary dictionary.
40  */
41 public class MakeBinaryDictionary {
42 
43     public static final int ALPHA_SIZE = 256;
44 
45     public static final String TAG_WORD = "w";
46     public static final String ATTR_FREQ = "f";
47 
48     private static final int FLAG_ADDRESS_MASK  = 0x400000;
49     private static final int FLAG_TERMINAL_MASK = 0x800000;
50     private static final int ADDRESS_MASK = 0x3FFFFF;
51 
52     public static final CharNode EMPTY_NODE = new CharNode();
53 
54     List<CharNode> roots;
55     Map<String, Integer> mDictionary;
56     int mWordCount;
57 
58     static class CharNode {
59         char data;
60         int freq;
61         boolean terminal;
62         List<CharNode> children;
63         static int sNodes;
64 
CharNode()65         public CharNode() {
66             sNodes++;
67         }
68     }
69 
usage()70     public static void usage() {
71         System.err.println("Usage: makedict <src.xml> <dest.dict>");
72         System.exit(-1);
73     }
74 
main(String[] args)75     public static void main(String[] args) {
76         if (args.length < 2) {
77             usage();
78         } else {
79             new MakeBinaryDictionary(args[0], args[1]);
80         }
81     }
82 
MakeBinaryDictionary(String srcFilename, String destFilename)83     public MakeBinaryDictionary(String srcFilename, String destFilename) {
84         populateDictionary(srcFilename);
85         writeToDict(destFilename);
86         // Enable the code below to verify that the generated tree is traversable.
87         if (false) {
88             traverseDict(0, new char[32], 0);
89         }
90     }
91 
populateDictionary(String filename)92     private void populateDictionary(String filename) {
93         roots = new ArrayList<CharNode>();
94         try {
95             SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
96             parser.parse(new File(filename), new DefaultHandler() {
97                 boolean inWord;
98                 int freq;
99                 StringBuilder wordBuilder = new StringBuilder(48);
100 
101                 @Override
102                 public void startElement(String uri, String localName,
103                         String qName, Attributes attributes) {
104                     if (qName.equals("w")) {
105                         inWord = true;
106                         freq = Integer.parseInt(attributes.getValue(0));
107                         wordBuilder.setLength(0);
108                     }
109                 }
110 
111                 @Override
112                 public void characters(char[] data, int offset, int length) {
113                     // Ignore other whitespace
114                     if (!inWord) return;
115                     wordBuilder.append(data, offset, length);
116                 }
117 
118                 @Override
119                 public void endElement(String uri, String localName,
120                         String qName) {
121                     if (qName.equals("w")) {
122                         if (wordBuilder.length() > 1) {
123                             addWordTop(wordBuilder.toString(), freq);
124                             mWordCount++;
125                         }
126                         inWord = false;
127                     }
128                 }
129             });
130         } catch (Exception ioe) {
131             System.err.println("Exception in parsing\n" + ioe);
132             ioe.printStackTrace();
133         }
134         System.out.println("Nodes = " + CharNode.sNodes);
135     }
136 
indexOf(List<CharNode> children, char c)137     private int indexOf(List<CharNode> children, char c) {
138         if (children == null) {
139             return -1;
140         }
141         for (int i = 0; i < children.size(); i++) {
142             if (children.get(i).data == c) {
143                 return i;
144             }
145         }
146         return -1;
147     }
148 
addWordTop(String word, int occur)149     private void addWordTop(String word, int occur) {
150         if (occur > 255) occur = 255;
151         char firstChar = word.charAt(0);
152         int index = indexOf(roots, firstChar);
153         if (index == -1) {
154             CharNode newNode = new CharNode();
155             newNode.data = firstChar;
156             newNode.freq = occur;
157             index = roots.size();
158             roots.add(newNode);
159         } else {
160             roots.get(index).freq += occur;
161         }
162         if (word.length() > 1) {
163             addWordRec(roots.get(index), word, 1, occur);
164         } else {
165             roots.get(index).terminal = true;
166         }
167     }
168 
addWordRec(CharNode parent, String word, int charAt, int occur)169     private void addWordRec(CharNode parent, String word, int charAt, int occur) {
170         CharNode child = null;
171         char data = word.charAt(charAt);
172         if (parent.children == null) {
173             parent.children = new ArrayList<CharNode>();
174         } else {
175             for (int i = 0; i < parent.children.size(); i++) {
176                 CharNode node = parent.children.get(i);
177                 if (node.data == data) {
178                     child = node;
179                     break;
180                 }
181             }
182         }
183         if (child == null) {
184             child = new CharNode();
185             parent.children.add(child);
186         }
187         child.data = data;
188         if (child.freq == 0) child.freq = occur;
189         if (word.length() > charAt + 1) {
190             addWordRec(child, word, charAt + 1, occur);
191         } else {
192             child.terminal = true;
193             child.freq = occur;
194         }
195     }
196 
197     byte[] dict;
198     int dictSize;
199     static final int CHAR_WIDTH = 8;
200     static final int FLAGS_WIDTH = 1; // Terminal flag (word end)
201     static final int ADDR_WIDTH = 23; // Offset to children
202     static final int FREQ_WIDTH_BYTES = 1;
203     static final int COUNT_WIDTH_BYTES = 1;
204 
addCount(int count)205     private void addCount(int count) {
206         dict[dictSize++] = (byte) (0xFF & count);
207     }
208 
addNode(CharNode node)209     private void addNode(CharNode node) {
210         int charData = 0xFFFF & node.data;
211         if (charData > 254) {
212             dict[dictSize++] = (byte) 255;
213             dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
214             dict[dictSize++] = (byte) (node.data & 0xFF);
215         } else {
216             dict[dictSize++] = (byte) (0xFF & node.data);
217         }
218         if (node.children != null) {
219             dictSize += 3; // Space for children address
220         } else {
221             dictSize += 1; // Space for just the terminal/address flags
222         }
223         if ((0xFFFFFF & node.freq) > 255) {
224             node.freq = 255;
225         }
226         if (node.terminal) {
227             byte freq = (byte) (0xFF & node.freq);
228             dict[dictSize++] = freq;
229         }
230     }
231 
232     int nullChildrenCount = 0;
233     int notTerminalCount = 0;
234 
updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress)235     private void updateNodeAddress(int nodeAddress, CharNode node,
236             int childrenAddress) {
237         if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
238             nodeAddress += 2;
239         }
240         childrenAddress = ADDRESS_MASK & childrenAddress;
241         if (childrenAddress == 0) {
242             nullChildrenCount++;
243         } else {
244             childrenAddress |= FLAG_ADDRESS_MASK;
245         }
246         if (node.terminal) {
247             childrenAddress |= FLAG_TERMINAL_MASK;
248         } else {
249             notTerminalCount++;
250         }
251         dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
252         if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
253             dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
254             dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
255         }
256     }
257 
writeWordsRec(List<CharNode> children)258     void writeWordsRec(List<CharNode> children) {
259         if (children == null || children.size() == 0) {
260             return;
261         }
262         final int childCount = children.size();
263         addCount(childCount);
264         //int childrenStart = dictSize;
265         int[] childrenAddresses = new int[childCount];
266         for (int j = 0; j < childCount; j++) {
267             CharNode node = children.get(j);
268             childrenAddresses[j] = dictSize;
269             addNode(node);
270         }
271         for (int j = 0; j < childCount; j++) {
272             CharNode node = children.get(j);
273             int nodeAddress = childrenAddresses[j];
274             int cacheDictSize = dictSize;
275             writeWordsRec(node.children);
276             updateNodeAddress(nodeAddress, node, node.children != null
277                     ? cacheDictSize : 0);
278         }
279     }
280 
writeToDict(String dictFilename)281     void writeToDict(String dictFilename) {
282         // 4MB max, 22-bit offsets
283         dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
284                                           // < 1MB in most cases, as there is a limit in the
285                                           // resource size in apks.
286         dictSize = 0;
287         writeWordsRec(roots);
288         System.out.println("Dict Size = " + dictSize);
289         try {
290             FileOutputStream fos = new FileOutputStream(dictFilename);
291             fos.write(dict, 0, dictSize);
292             fos.close();
293         } catch (IOException ioe) {
294             System.err.println("Error writing dict file:" + ioe);
295         }
296     }
297 
traverseDict(int pos, char[] word, int depth)298     void traverseDict(int pos, char[] word, int depth) {
299         int count = dict[pos++] & 0xFF;
300         for (int i = 0; i < count; i++) {
301             char c = (char) (dict[pos++] & 0xFF);
302             if (c == 0xFF) {
303                 c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
304                 pos += 2;
305             }
306             word[depth] = c;
307             boolean terminal = (dict[pos] & 0x80) > 0;
308             int address = 0;
309             if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
310                 address =
311                     ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
312                     | ((dict[pos + 1] & 0xFF) << 8)
313                     | ((dict[pos + 2] & 0xFF));
314                 pos += 2;
315             }
316             pos++;
317             if (terminal) {
318                 showWord(word, depth + 1, dict[pos] & 0xFF);
319                 pos++;
320             }
321             if (address != 0) {
322                 traverseDict(address, word, depth + 1);
323             }
324         }
325     }
326 
showWord(char[] word, int size, int freq)327     void showWord(char[] word, int size, int freq) {
328         System.out.print(new String(word, 0, size) + " " + freq + "\n");
329     }
330 }
331