1 /* 2 * Copyright (C) 2009 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.tools.dict; 18 19 import org.xml.sax.Attributes; 20 import org.xml.sax.helpers.DefaultHandler; 21 22 import java.io.BufferedReader; 23 import java.io.BufferedWriter; 24 import java.io.File; 25 import java.io.FileInputStream; 26 import java.io.FileOutputStream; 27 import java.io.FileWriter; 28 import java.io.IOException; 29 import java.io.InputStreamReader; 30 import java.util.ArrayList; 31 import java.util.HashMap; 32 import java.util.List; 33 import java.util.Map; 34 35 import javax.xml.parsers.SAXParser; 36 import javax.xml.parsers.SAXParserFactory; 37 38 /** 39 * Compresses a list of words and frequencies into a tree structured binary dictionary. 40 */ 41 public class MakeBinaryDictionary { 42 43 public static final int ALPHA_SIZE = 256; 44 45 public static final String TAG_WORD = "w"; 46 public static final String ATTR_FREQ = "f"; 47 48 private static final int FLAG_ADDRESS_MASK = 0x400000; 49 private static final int FLAG_TERMINAL_MASK = 0x800000; 50 private static final int ADDRESS_MASK = 0x3FFFFF; 51 52 public static final CharNode EMPTY_NODE = new CharNode(); 53 54 List<CharNode> roots; 55 Map<String, Integer> mDictionary; 56 int mWordCount; 57 58 static class CharNode { 59 char data; 60 int freq; 61 boolean terminal; 62 List<CharNode> children; 63 static int sNodes; 64 CharNode()65 public CharNode() { 66 sNodes++; 67 } 68 } 69 usage()70 public static void usage() { 71 System.err.println("Usage: makedict <src.xml> <dest.dict>"); 72 System.exit(-1); 73 } 74 main(String[] args)75 public static void main(String[] args) { 76 if (args.length < 2) { 77 usage(); 78 } else { 79 new MakeBinaryDictionary(args[0], args[1]); 80 } 81 } 82 MakeBinaryDictionary(String srcFilename, String destFilename)83 public MakeBinaryDictionary(String srcFilename, String destFilename) { 84 populateDictionary(srcFilename); 85 writeToDict(destFilename); 86 // Enable the code below to verify that the generated tree is traversable. 87 if (false) { 88 traverseDict(0, new char[32], 0); 89 } 90 } 91 populateDictionary(String filename)92 private void populateDictionary(String filename) { 93 roots = new ArrayList<CharNode>(); 94 try { 95 SAXParser parser = SAXParserFactory.newInstance().newSAXParser(); 96 parser.parse(new File(filename), new DefaultHandler() { 97 boolean inWord; 98 int freq; 99 StringBuilder wordBuilder = new StringBuilder(48); 100 101 @Override 102 public void startElement(String uri, String localName, 103 String qName, Attributes attributes) { 104 if (qName.equals("w")) { 105 inWord = true; 106 freq = Integer.parseInt(attributes.getValue(0)); 107 wordBuilder.setLength(0); 108 } 109 } 110 111 @Override 112 public void characters(char[] data, int offset, int length) { 113 // Ignore other whitespace 114 if (!inWord) return; 115 wordBuilder.append(data, offset, length); 116 } 117 118 @Override 119 public void endElement(String uri, String localName, 120 String qName) { 121 if (qName.equals("w")) { 122 if (wordBuilder.length() > 1) { 123 addWordTop(wordBuilder.toString(), freq); 124 mWordCount++; 125 } 126 inWord = false; 127 } 128 } 129 }); 130 } catch (Exception ioe) { 131 System.err.println("Exception in parsing\n" + ioe); 132 ioe.printStackTrace(); 133 } 134 System.out.println("Nodes = " + CharNode.sNodes); 135 } 136 indexOf(List<CharNode> children, char c)137 private int indexOf(List<CharNode> children, char c) { 138 if (children == null) { 139 return -1; 140 } 141 for (int i = 0; i < children.size(); i++) { 142 if (children.get(i).data == c) { 143 return i; 144 } 145 } 146 return -1; 147 } 148 addWordTop(String word, int occur)149 private void addWordTop(String word, int occur) { 150 if (occur > 255) occur = 255; 151 char firstChar = word.charAt(0); 152 int index = indexOf(roots, firstChar); 153 if (index == -1) { 154 CharNode newNode = new CharNode(); 155 newNode.data = firstChar; 156 newNode.freq = occur; 157 index = roots.size(); 158 roots.add(newNode); 159 } else { 160 roots.get(index).freq += occur; 161 } 162 if (word.length() > 1) { 163 addWordRec(roots.get(index), word, 1, occur); 164 } else { 165 roots.get(index).terminal = true; 166 } 167 } 168 addWordRec(CharNode parent, String word, int charAt, int occur)169 private void addWordRec(CharNode parent, String word, int charAt, int occur) { 170 CharNode child = null; 171 char data = word.charAt(charAt); 172 if (parent.children == null) { 173 parent.children = new ArrayList<CharNode>(); 174 } else { 175 for (int i = 0; i < parent.children.size(); i++) { 176 CharNode node = parent.children.get(i); 177 if (node.data == data) { 178 child = node; 179 break; 180 } 181 } 182 } 183 if (child == null) { 184 child = new CharNode(); 185 parent.children.add(child); 186 } 187 child.data = data; 188 if (child.freq == 0) child.freq = occur; 189 if (word.length() > charAt + 1) { 190 addWordRec(child, word, charAt + 1, occur); 191 } else { 192 child.terminal = true; 193 child.freq = occur; 194 } 195 } 196 197 byte[] dict; 198 int dictSize; 199 static final int CHAR_WIDTH = 8; 200 static final int FLAGS_WIDTH = 1; // Terminal flag (word end) 201 static final int ADDR_WIDTH = 23; // Offset to children 202 static final int FREQ_WIDTH_BYTES = 1; 203 static final int COUNT_WIDTH_BYTES = 1; 204 addCount(int count)205 private void addCount(int count) { 206 dict[dictSize++] = (byte) (0xFF & count); 207 } 208 addNode(CharNode node)209 private void addNode(CharNode node) { 210 int charData = 0xFFFF & node.data; 211 if (charData > 254) { 212 dict[dictSize++] = (byte) 255; 213 dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF); 214 dict[dictSize++] = (byte) (node.data & 0xFF); 215 } else { 216 dict[dictSize++] = (byte) (0xFF & node.data); 217 } 218 if (node.children != null) { 219 dictSize += 3; // Space for children address 220 } else { 221 dictSize += 1; // Space for just the terminal/address flags 222 } 223 if ((0xFFFFFF & node.freq) > 255) { 224 node.freq = 255; 225 } 226 if (node.terminal) { 227 byte freq = (byte) (0xFF & node.freq); 228 dict[dictSize++] = freq; 229 } 230 } 231 232 int nullChildrenCount = 0; 233 int notTerminalCount = 0; 234 updateNodeAddress(int nodeAddress, CharNode node, int childrenAddress)235 private void updateNodeAddress(int nodeAddress, CharNode node, 236 int childrenAddress) { 237 if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character 238 nodeAddress += 2; 239 } 240 childrenAddress = ADDRESS_MASK & childrenAddress; 241 if (childrenAddress == 0) { 242 nullChildrenCount++; 243 } else { 244 childrenAddress |= FLAG_ADDRESS_MASK; 245 } 246 if (node.terminal) { 247 childrenAddress |= FLAG_TERMINAL_MASK; 248 } else { 249 notTerminalCount++; 250 } 251 dict[nodeAddress + 1] = (byte) (childrenAddress >> 16); 252 if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) { 253 dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8); 254 dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF)); 255 } 256 } 257 writeWordsRec(List<CharNode> children)258 void writeWordsRec(List<CharNode> children) { 259 if (children == null || children.size() == 0) { 260 return; 261 } 262 final int childCount = children.size(); 263 addCount(childCount); 264 //int childrenStart = dictSize; 265 int[] childrenAddresses = new int[childCount]; 266 for (int j = 0; j < childCount; j++) { 267 CharNode node = children.get(j); 268 childrenAddresses[j] = dictSize; 269 addNode(node); 270 } 271 for (int j = 0; j < childCount; j++) { 272 CharNode node = children.get(j); 273 int nodeAddress = childrenAddresses[j]; 274 int cacheDictSize = dictSize; 275 writeWordsRec(node.children); 276 updateNodeAddress(nodeAddress, node, node.children != null 277 ? cacheDictSize : 0); 278 } 279 } 280 writeToDict(String dictFilename)281 void writeToDict(String dictFilename) { 282 // 4MB max, 22-bit offsets 283 dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably 284 // < 1MB in most cases, as there is a limit in the 285 // resource size in apks. 286 dictSize = 0; 287 writeWordsRec(roots); 288 System.out.println("Dict Size = " + dictSize); 289 try { 290 FileOutputStream fos = new FileOutputStream(dictFilename); 291 fos.write(dict, 0, dictSize); 292 fos.close(); 293 } catch (IOException ioe) { 294 System.err.println("Error writing dict file:" + ioe); 295 } 296 } 297 traverseDict(int pos, char[] word, int depth)298 void traverseDict(int pos, char[] word, int depth) { 299 int count = dict[pos++] & 0xFF; 300 for (int i = 0; i < count; i++) { 301 char c = (char) (dict[pos++] & 0xFF); 302 if (c == 0xFF) { 303 c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF)); 304 pos += 2; 305 } 306 word[depth] = c; 307 boolean terminal = (dict[pos] & 0x80) > 0; 308 int address = 0; 309 if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) { 310 address = 311 ((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16) 312 | ((dict[pos + 1] & 0xFF) << 8) 313 | ((dict[pos + 2] & 0xFF)); 314 pos += 2; 315 } 316 pos++; 317 if (terminal) { 318 showWord(word, depth + 1, dict[pos] & 0xFF); 319 pos++; 320 } 321 if (address != 0) { 322 traverseDict(address, word, depth + 1); 323 } 324 } 325 } 326 showWord(char[] word, int size, int freq)327 void showWord(char[] word, int size, int freq) { 328 System.out.print(new String(word, 0, size) + " " + freq + "\n"); 329 } 330 } 331