• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
20 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
21 import com.android.inputmethod.latin.makedict.FusionDictionary.Node;
22 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
23 
24 import java.io.ByteArrayOutputStream;
25 import java.io.FileNotFoundException;
26 import java.io.IOException;
27 import java.io.OutputStream;
28 import java.io.RandomAccessFile;
29 import java.util.ArrayList;
30 import java.util.Arrays;
31 import java.util.HashMap;
32 import java.util.Iterator;
33 import java.util.Map;
34 import java.util.TreeMap;
35 
36 /**
37  * Reads and writes XML files for a FusionDictionary.
38  *
39  * All the methods in this class are static.
40  */
41 public class BinaryDictInputOutput {
42 
43     final static boolean DBG = MakedictLog.DBG;
44 
45     /* Node layout is as follows:
46      *   | addressType                         xx     : mask with MASK_GROUP_ADDRESS_TYPE
47      *                                 2 bits, 00 = no children : FLAG_GROUP_ADDRESS_TYPE_NOADDRESS
48      * f |                                     01 = 1 byte      : FLAG_GROUP_ADDRESS_TYPE_ONEBYTE
49      * l |                                     10 = 2 bytes     : FLAG_GROUP_ADDRESS_TYPE_TWOBYTES
50      * a |                                     11 = 3 bytes     : FLAG_GROUP_ADDRESS_TYPE_THREEBYTES
51      * g | has several chars ?         1 bit, 1 = yes, 0 = no   : FLAG_HAS_MULTIPLE_CHARS
52      * s | has a terminal ?            1 bit, 1 = yes, 0 = no   : FLAG_IS_TERMINAL
53      *   | has shortcut targets ?      1 bit, 1 = yes, 0 = no   : FLAG_HAS_SHORTCUT_TARGETS
54      *   | has bigrams ?               1 bit, 1 = yes, 0 = no   : FLAG_HAS_BIGRAMS
55      *
56      * c | IF FLAG_HAS_MULTIPLE_CHARS
57      * h |   char, char, char, char    n * (1 or 3 bytes) : use CharGroupInfo for i/o helpers
58      * a |   end                       1 byte, = 0
59      * r | ELSE
60      * s |   char                      1 or 3 bytes
61      *   | END
62      *
63      * f |
64      * r | IF FLAG_IS_TERMINAL
65      * e |   frequency                 1 byte
66      * q |
67      *
68      * c | IF 00 = FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = addressType
69      * h |   // nothing
70      * i | ELSIF 01 = FLAG_GROUP_ADDRESS_TYPE_ONEBYTE == addressType
71      * l |   children address, 1 byte
72      * d | ELSIF 10 = FLAG_GROUP_ADDRESS_TYPE_TWOBYTES == addressType
73      * r |   children address, 2 bytes
74      * e | ELSE // 11 = FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = addressType
75      * n |   children address, 3 bytes
76      * A | END
77      * d
78      * dress
79      *
80      *   | IF FLAG_IS_TERMINAL && FLAG_HAS_SHORTCUT_TARGETS
81      *   | shortcut string list
82      *   | IF FLAG_IS_TERMINAL && FLAG_HAS_BIGRAMS
83      *   | bigrams address list
84      *
85      * Char format is:
86      * 1 byte = bbbbbbbb match
87      * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
88      * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
89      *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
90      *       00011111 would be outside unicode.
91      * else: iso-latin-1 code
92      * This allows for the whole unicode range to be encoded, including chars outside of
93      * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
94      * characters which should never happen anyway (and still work, but take 3 bytes).
95      *
96      * bigram address list is:
97      * <flags> = | hasNext = 1 bit, 1 = yes, 0 = no     : FLAG_ATTRIBUTE_HAS_NEXT
98      *           | addressSign = 1 bit,                 : FLAG_ATTRIBUTE_OFFSET_NEGATIVE
99      *           |                      1 = must take -address, 0 = must take +address
100      *           |                         xx : mask with MASK_ATTRIBUTE_ADDRESS_TYPE
101      *           | addressFormat = 2 bits, 00 = unused  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
102      *           |                         01 = 1 byte  : FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE
103      *           |                         10 = 2 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES
104      *           |                         11 = 3 bytes : FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES
105      *           | 4 bits : frequency         : mask with FLAG_ATTRIBUTE_FREQUENCY
106      * <address> | IF (01 == FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE == addressFormat)
107      *           |   read 1 byte, add top 4 bits
108      *           | ELSIF (10 == FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES == addressFormat)
109      *           |   read 2 bytes, add top 4 bits
110      *           | ELSE // 11 == FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES == addressFormat
111      *           |   read 3 bytes, add top 4 bits
112      *           | END
113      *           | if (FLAG_ATTRIBUTE_OFFSET_NEGATIVE) then address = -address
114      * if (FLAG_ATTRIBUTE_HAS_NEXT) goto bigram_and_shortcut_address_list_is
115      *
116      * shortcut string list is:
117      * <byte size> = GROUP_SHORTCUT_LIST_SIZE_SIZE bytes, big-endian: size of the list, in bytes.
118      * <flags>     = | hasNext = 1 bit, 1 = yes, 0 = no : FLAG_ATTRIBUTE_HAS_NEXT
119      *               | reserved = 3 bits, must be 0
120      *               | 4 bits : frequency : mask with FLAG_ATTRIBUTE_FREQUENCY
121      * <shortcut>  = | string of characters at the char format described above, with the terminator
122      *               | used to signal the end of the string.
123      * if (FLAG_ATTRIBUTE_HAS_NEXT goto flags
124      */
125 
126     private static final int VERSION_1_MAGIC_NUMBER = 0x78B1;
127     private static final int VERSION_2_MAGIC_NUMBER = 0x9BC13AFE;
128     private static final int MINIMUM_SUPPORTED_VERSION = 1;
129     private static final int MAXIMUM_SUPPORTED_VERSION = 2;
130     private static final int NOT_A_VERSION_NUMBER = -1;
131     private static final int FIRST_VERSION_WITH_HEADER_SIZE = 2;
132 
133     // These options need to be the same numeric values as the one in the native reading code.
134     private static final int GERMAN_UMLAUT_PROCESSING_FLAG = 0x1;
135     private static final int FRENCH_LIGATURE_PROCESSING_FLAG = 0x4;
136     private static final int CONTAINS_BIGRAMS_FLAG = 0x8;
137 
138     // TODO: Make this value adaptative to content data, store it in the header, and
139     // use it in the reading code.
140     private static final int MAX_WORD_LENGTH = 48;
141 
142     private static final int MASK_GROUP_ADDRESS_TYPE = 0xC0;
143     private static final int FLAG_GROUP_ADDRESS_TYPE_NOADDRESS = 0x00;
144     private static final int FLAG_GROUP_ADDRESS_TYPE_ONEBYTE = 0x40;
145     private static final int FLAG_GROUP_ADDRESS_TYPE_TWOBYTES = 0x80;
146     private static final int FLAG_GROUP_ADDRESS_TYPE_THREEBYTES = 0xC0;
147 
148     private static final int FLAG_HAS_MULTIPLE_CHARS = 0x20;
149 
150     private static final int FLAG_IS_TERMINAL = 0x10;
151     private static final int FLAG_HAS_SHORTCUT_TARGETS = 0x08;
152     private static final int FLAG_HAS_BIGRAMS = 0x04;
153 
154     private static final int FLAG_ATTRIBUTE_HAS_NEXT = 0x80;
155     private static final int FLAG_ATTRIBUTE_OFFSET_NEGATIVE = 0x40;
156     private static final int MASK_ATTRIBUTE_ADDRESS_TYPE = 0x30;
157     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE = 0x10;
158     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES = 0x20;
159     private static final int FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES = 0x30;
160     private static final int FLAG_ATTRIBUTE_FREQUENCY = 0x0F;
161 
162     private static final int GROUP_CHARACTERS_TERMINATOR = 0x1F;
163 
164     private static final int GROUP_TERMINATOR_SIZE = 1;
165     private static final int GROUP_FLAGS_SIZE = 1;
166     private static final int GROUP_FREQUENCY_SIZE = 1;
167     private static final int GROUP_MAX_ADDRESS_SIZE = 3;
168     private static final int GROUP_ATTRIBUTE_FLAGS_SIZE = 1;
169     private static final int GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE = 3;
170     private static final int GROUP_SHORTCUT_LIST_SIZE_SIZE = 2;
171 
172     private static final int NO_CHILDREN_ADDRESS = Integer.MIN_VALUE;
173     private static final int INVALID_CHARACTER = -1;
174 
175     private static final int MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT = 0x7F; // 127
176     private static final int MAX_CHARGROUPS_IN_A_NODE = 0x7FFF; // 32767
177 
178     private static final int MAX_TERMINAL_FREQUENCY = 255;
179     private static final int MAX_BIGRAM_FREQUENCY = 15;
180 
181     // Arbitrary limit to how much passes we consider address size compression should
182     // terminate in. At the time of this writing, our largest dictionary completes
183     // compression in five passes.
184     // If the number of passes exceeds this number, makedict bails with an exception on
185     // suspicion that a bug might be causing an infinite loop.
186     private static final int MAX_PASSES = 24;
187 
188     /**
189      * A class grouping utility function for our specific character encoding.
190      */
191     private static class CharEncoding {
192 
193         private static final int MINIMAL_ONE_BYTE_CHARACTER_VALUE = 0x20;
194         private static final int MAXIMAL_ONE_BYTE_CHARACTER_VALUE = 0xFF;
195 
196         /**
197          * Helper method to find out whether this code fits on one byte
198          */
fitsOnOneByte(int character)199         private static boolean fitsOnOneByte(int character) {
200             return character >= MINIMAL_ONE_BYTE_CHARACTER_VALUE
201                     && character <= MAXIMAL_ONE_BYTE_CHARACTER_VALUE;
202         }
203 
204         /**
205          * Compute the size of a character given its character code.
206          *
207          * Char format is:
208          * 1 byte = bbbbbbbb match
209          * case 000xxxxx: xxxxx << 16 + next byte << 8 + next byte
210          * else: if 00011111 (= 0x1F) : this is the terminator. This is a relevant choice because
211          *       unicode code points range from 0 to 0x10FFFF, so any 3-byte value starting with
212          *       00011111 would be outside unicode.
213          * else: iso-latin-1 code
214          * This allows for the whole unicode range to be encoded, including chars outside of
215          * the BMP. Also everything in the iso-latin-1 charset is only 1 byte, except control
216          * characters which should never happen anyway (and still work, but take 3 bytes).
217          *
218          * @param character the character code.
219          * @return the size in binary encoded-form, either 1 or 3 bytes.
220          */
getCharSize(int character)221         private static int getCharSize(int character) {
222             // See char encoding in FusionDictionary.java
223             if (fitsOnOneByte(character)) return 1;
224             if (INVALID_CHARACTER == character) return 1;
225             return 3;
226         }
227 
228         /**
229          * Compute the byte size of a character array.
230          */
getCharArraySize(final int[] chars)231         private static int getCharArraySize(final int[] chars) {
232             int size = 0;
233             for (int character : chars) size += getCharSize(character);
234             return size;
235         }
236 
237         /**
238          * Writes a char array to a byte buffer.
239          *
240          * @param codePoints the code point array to write.
241          * @param buffer the byte buffer to write to.
242          * @param index the index in buffer to write the character array to.
243          * @return the index after the last character.
244          */
writeCharArray(final int[] codePoints, final byte[] buffer, int index)245         private static int writeCharArray(final int[] codePoints, final byte[] buffer, int index) {
246             for (int codePoint : codePoints) {
247                 if (1 == getCharSize(codePoint)) {
248                     buffer[index++] = (byte)codePoint;
249                 } else {
250                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
251                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
252                     buffer[index++] = (byte)(0xFF & codePoint);
253                 }
254             }
255             return index;
256         }
257 
258         /**
259          * Writes a string with our character format to a byte buffer.
260          *
261          * This will also write the terminator byte.
262          *
263          * @param buffer the byte buffer to write to.
264          * @param origin the offset to write from.
265          * @param word the string to write.
266          * @return the size written, in bytes.
267          */
writeString(final byte[] buffer, final int origin, final String word)268         private static int writeString(final byte[] buffer, final int origin,
269                 final String word) {
270             final int length = word.length();
271             int index = origin;
272             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
273                 final int codePoint = word.codePointAt(i);
274                 if (1 == getCharSize(codePoint)) {
275                     buffer[index++] = (byte)codePoint;
276                 } else {
277                     buffer[index++] = (byte)(0xFF & (codePoint >> 16));
278                     buffer[index++] = (byte)(0xFF & (codePoint >> 8));
279                     buffer[index++] = (byte)(0xFF & codePoint);
280                 }
281             }
282             buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
283             return index - origin;
284         }
285 
286         /**
287          * Writes a string with our character format to a ByteArrayOutputStream.
288          *
289          * This will also write the terminator byte.
290          *
291          * @param buffer the ByteArrayOutputStream to write to.
292          * @param word the string to write.
293          */
writeString(ByteArrayOutputStream buffer, final String word)294         private static void writeString(ByteArrayOutputStream buffer, final String word) {
295             final int length = word.length();
296             for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
297                 final int codePoint = word.codePointAt(i);
298                 if (1 == getCharSize(codePoint)) {
299                     buffer.write((byte) codePoint);
300                 } else {
301                     buffer.write((byte) (0xFF & (codePoint >> 16)));
302                     buffer.write((byte) (0xFF & (codePoint >> 8)));
303                     buffer.write((byte) (0xFF & codePoint));
304                 }
305             }
306             buffer.write(GROUP_CHARACTERS_TERMINATOR);
307         }
308 
309         /**
310          * Reads a string from a RandomAccessFile. This is the converse of the above method.
311          */
readString(final RandomAccessFile source)312         private static String readString(final RandomAccessFile source) throws IOException {
313             final StringBuilder s = new StringBuilder();
314             int character = readChar(source);
315             while (character != INVALID_CHARACTER) {
316                 s.appendCodePoint(character);
317                 character = readChar(source);
318             }
319             return s.toString();
320         }
321 
322         /**
323          * Reads a character from the file.
324          *
325          * This follows the character format documented earlier in this source file.
326          *
327          * @param source the file, positioned over an encoded character.
328          * @return the character code.
329          */
readChar(RandomAccessFile source)330         private static int readChar(RandomAccessFile source) throws IOException {
331             int character = source.readUnsignedByte();
332             if (!fitsOnOneByte(character)) {
333                 if (GROUP_CHARACTERS_TERMINATOR == character)
334                     return INVALID_CHARACTER;
335                 character <<= 16;
336                 character += source.readUnsignedShort();
337             }
338             return character;
339         }
340     }
341 
342     /**
343      * Compute the binary size of the character array in a group
344      *
345      * If only one character, this is the size of this character. If many, it's the sum of their
346      * sizes + 1 byte for the terminator.
347      *
348      * @param group the group
349      * @return the size of the char array, including the terminator if any
350      */
getGroupCharactersSize(CharGroup group)351     private static int getGroupCharactersSize(CharGroup group) {
352         int size = CharEncoding.getCharArraySize(group.mChars);
353         if (group.hasSeveralChars()) size += GROUP_TERMINATOR_SIZE;
354         return size;
355     }
356 
357     /**
358      * Compute the binary size of the group count
359      * @param count the group count
360      * @return the size of the group count, either 1 or 2 bytes.
361      */
getGroupCountSize(final int count)362     private static int getGroupCountSize(final int count) {
363         if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= count) {
364             return 1;
365         } else if (MAX_CHARGROUPS_IN_A_NODE >= count) {
366             return 2;
367         } else {
368             throw new RuntimeException("Can't have more than " + MAX_CHARGROUPS_IN_A_NODE
369                     + " groups in a node (found " + count +")");
370         }
371     }
372 
373     /**
374      * Compute the binary size of the group count for a node
375      * @param node the node
376      * @return the size of the group count, either 1 or 2 bytes.
377      */
getGroupCountSize(final Node node)378     private static int getGroupCountSize(final Node node) {
379         return getGroupCountSize(node.mData.size());
380     }
381 
382     /**
383      * Compute the size of a shortcut in bytes.
384      */
getShortcutSize(final WeightedString shortcut)385     private static int getShortcutSize(final WeightedString shortcut) {
386         int size = GROUP_ATTRIBUTE_FLAGS_SIZE;
387         final String word = shortcut.mWord;
388         final int length = word.length();
389         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
390             final int codePoint = word.codePointAt(i);
391             size += CharEncoding.getCharSize(codePoint);
392         }
393         size += GROUP_TERMINATOR_SIZE;
394         return size;
395     }
396 
397     /**
398      * Compute the size of a shortcut list in bytes.
399      *
400      * This is known in advance and does not change according to position in the file
401      * like address lists do.
402      */
getShortcutListSize(final ArrayList<WeightedString> shortcutList)403     private static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
404         if (null == shortcutList) return 0;
405         int size = GROUP_SHORTCUT_LIST_SIZE_SIZE;
406         for (final WeightedString shortcut : shortcutList) {
407             size += getShortcutSize(shortcut);
408         }
409         return size;
410     }
411 
412     /**
413      * Compute the maximum size of a CharGroup, assuming 3-byte addresses for everything.
414      *
415      * @param group the CharGroup to compute the size of.
416      * @return the maximum size of the group.
417      */
getCharGroupMaximumSize(CharGroup group)418     private static int getCharGroupMaximumSize(CharGroup group) {
419         int size = getGroupCharactersSize(group) + GROUP_FLAGS_SIZE;
420         // If terminal, one byte for the frequency
421         if (group.isTerminal()) size += GROUP_FREQUENCY_SIZE;
422         size += GROUP_MAX_ADDRESS_SIZE; // For children address
423         size += getShortcutListSize(group.mShortcutTargets);
424         if (null != group.mBigrams) {
425             size += (GROUP_ATTRIBUTE_FLAGS_SIZE + GROUP_ATTRIBUTE_MAX_ADDRESS_SIZE)
426                     * group.mBigrams.size();
427         }
428         return size;
429     }
430 
431     /**
432      * Compute the maximum size of a node, assuming 3-byte addresses for everything, and caches
433      * it in the 'actualSize' member of the node.
434      *
435      * @param node the node to compute the maximum size of.
436      */
setNodeMaximumSize(Node node)437     private static void setNodeMaximumSize(Node node) {
438         int size = getGroupCountSize(node);
439         for (CharGroup g : node.mData) {
440             final int groupSize = getCharGroupMaximumSize(g);
441             g.mCachedSize = groupSize;
442             size += groupSize;
443         }
444         node.mCachedSize = size;
445     }
446 
447     /**
448      * Helper method to hide the actual value of the no children address.
449      */
hasChildrenAddress(int address)450     private static boolean hasChildrenAddress(int address) {
451         return NO_CHILDREN_ADDRESS != address;
452     }
453 
454     /**
455      * Compute the size, in bytes, that an address will occupy.
456      *
457      * This can be used either for children addresses (which are always positive) or for
458      * attribute, which may be positive or negative but
459      * store their sign bit separately.
460      *
461      * @param address the address
462      * @return the byte size.
463      */
getByteSize(int address)464     private static int getByteSize(int address) {
465         assert(address < 0x1000000);
466         if (!hasChildrenAddress(address)) {
467             return 0;
468         } else if (Math.abs(address) < 0x100) {
469             return 1;
470         } else if (Math.abs(address) < 0x10000) {
471             return 2;
472         } else {
473             return 3;
474         }
475     }
476     // End utility methods.
477 
478     // This method is responsible for finding a nice ordering of the nodes that favors run-time
479     // cache performance and dictionary size.
480     /* package for tests */ static ArrayList<Node> flattenTree(Node root) {
481         final int treeSize = FusionDictionary.countCharGroups(root);
482         MakedictLog.i("Counted nodes : " + treeSize);
483         final ArrayList<Node> flatTree = new ArrayList<Node>(treeSize);
484         return flattenTreeInner(flatTree, root);
485     }
486 
487     private static ArrayList<Node> flattenTreeInner(ArrayList<Node> list, Node node) {
488         // Removing the node is necessary if the tails are merged, because we would then
489         // add the same node several times when we only want it once. A number of places in
490         // the code also depends on any node being only once in the list.
491         // Merging tails can only be done if there are no attributes. Searching for attributes
492         // in LatinIME code depends on a total breadth-first ordering, which merging tails
493         // breaks. If there are no attributes, it should be fine (and reduce the file size)
494         // to merge tails, and removing the node from the list would be necessary. However,
495         // we don't merge tails because breaking the breadth-first ordering would result in
496         // extreme overhead at bigram lookup time (it would make the search function O(n) instead
497         // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty
498         // high).
499         // If no nodes are ever merged, we can't have the same node twice in the list, hence
500         // searching for duplicates in unnecessary. It is also very performance consuming,
501         // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making
502         // this simple list.remove operation O(n*n) overall. On Android this overhead is very
503         // high.
504         // For future reference, the code to remove duplicate is a simple : list.remove(node);
505         list.add(node);
506         final ArrayList<CharGroup> branches = node.mData;
507         final int nodeSize = branches.size();
508         for (CharGroup group : branches) {
509             if (null != group.mChildren) flattenTreeInner(list, group.mChildren);
510         }
511         return list;
512     }
513 
514     /**
515      * Finds the absolute address of a word in the dictionary.
516      *
517      * @param dict the dictionary in which to search.
518      * @param word the word we are searching for.
519      * @return the word address. If it is not found, an exception is thrown.
520      */
521     private static int findAddressOfWord(final FusionDictionary dict, final String word) {
522         return FusionDictionary.findWordInTree(dict.mRoot, word).mCachedAddress;
523     }
524 
525     /**
526      * Computes the actual node size, based on the cached addresses of the children nodes.
527      *
528      * Each node stores its tentative address. During dictionary address computing, these
529      * are not final, but they can be used to compute the node size (the node size depends
530      * on the address of the children because the number of bytes necessary to store an
531      * address depends on its numeric value. The return value indicates whether the node
532      * contents (as in, any of the addresses stored in the cache fields) have changed with
533      * respect to their previous value.
534      *
535      * @param node the node to compute the size of.
536      * @param dict the dictionary in which the word/attributes are to be found.
537      * @return false if none of the cached addresses inside the node changed, true otherwise.
538      */
539     private static boolean computeActualNodeSize(Node node, FusionDictionary dict) {
540         boolean changed = false;
541         int size = getGroupCountSize(node);
542         for (CharGroup group : node.mData) {
543             if (group.mCachedAddress != node.mCachedAddress + size) {
544                 changed = true;
545                 group.mCachedAddress = node.mCachedAddress + size;
546             }
547             int groupSize = GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
548             if (group.isTerminal()) groupSize += GROUP_FREQUENCY_SIZE;
549             if (null != group.mChildren) {
550                 final int offsetBasePoint= groupSize + node.mCachedAddress + size;
551                 final int offset = group.mChildren.mCachedAddress - offsetBasePoint;
552                 groupSize += getByteSize(offset);
553             }
554             groupSize += getShortcutListSize(group.mShortcutTargets);
555             if (null != group.mBigrams) {
556                 for (WeightedString bigram : group.mBigrams) {
557                     final int offsetBasePoint = groupSize + node.mCachedAddress + size
558                             + GROUP_FLAGS_SIZE;
559                     final int addressOfBigram = findAddressOfWord(dict, bigram.mWord);
560                     final int offset = addressOfBigram - offsetBasePoint;
561                     groupSize += getByteSize(offset) + GROUP_FLAGS_SIZE;
562                 }
563             }
564             group.mCachedSize = groupSize;
565             size += groupSize;
566         }
567         if (node.mCachedSize != size) {
568             node.mCachedSize = size;
569             changed = true;
570         }
571         return changed;
572     }
573 
574     /**
575      * Computes the byte size of a list of nodes and updates each node cached position.
576      *
577      * @param flatNodes the array of nodes.
578      * @return the byte size of the entire stack.
579      */
580     private static int stackNodes(ArrayList<Node> flatNodes) {
581         int nodeOffset = 0;
582         for (Node n : flatNodes) {
583             n.mCachedAddress = nodeOffset;
584             int groupCountSize = getGroupCountSize(n);
585             int groupOffset = 0;
586             for (CharGroup g : n.mData) {
587                 g.mCachedAddress = groupCountSize + nodeOffset + groupOffset;
588                 groupOffset += g.mCachedSize;
589             }
590             if (groupOffset + groupCountSize != n.mCachedSize) {
591                 throw new RuntimeException("Bug : Stored and computed node size differ");
592             }
593             nodeOffset += n.mCachedSize;
594         }
595         return nodeOffset;
596     }
597 
598     /**
599      * Compute the addresses and sizes of an ordered node array.
600      *
601      * This method takes a node array and will update its cached address and size values
602      * so that they can be written into a file. It determines the smallest size each of the
603      * nodes can be given the addresses of its children and attributes, and store that into
604      * each node.
605      * The order of the node is given by the order of the array. This method makes no effort
606      * to find a good order; it only mechanically computes the size this order results in.
607      *
608      * @param dict the dictionary
609      * @param flatNodes the ordered array of nodes
610      * @return the same array it was passed. The nodes have been updated for address and size.
611      */
612     private static ArrayList<Node> computeAddresses(FusionDictionary dict,
613             ArrayList<Node> flatNodes) {
614         // First get the worst sizes and offsets
615         for (Node n : flatNodes) setNodeMaximumSize(n);
616         final int offset = stackNodes(flatNodes);
617 
618         MakedictLog.i("Compressing the array addresses. Original size : " + offset);
619         MakedictLog.i("(Recursively seen size : " + offset + ")");
620 
621         int passes = 0;
622         boolean changesDone = false;
623         do {
624             changesDone = false;
625             for (Node n : flatNodes) {
626                 final int oldNodeSize = n.mCachedSize;
627                 final boolean changed = computeActualNodeSize(n, dict);
628                 final int newNodeSize = n.mCachedSize;
629                 if (oldNodeSize < newNodeSize) throw new RuntimeException("Increased size ?!");
630                 changesDone |= changed;
631             }
632             stackNodes(flatNodes);
633             ++passes;
634             if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
635         } while (changesDone);
636 
637         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
638         MakedictLog.i("Compression complete in " + passes + " passes.");
639         MakedictLog.i("After address compression : "
640                 + (lastNode.mCachedAddress + lastNode.mCachedSize));
641 
642         return flatNodes;
643     }
644 
645     /**
646      * Sanity-checking method.
647      *
648      * This method checks an array of node for juxtaposition, that is, it will do
649      * nothing if each node's cached address is actually the previous node's address
650      * plus the previous node's size.
651      * If this is not the case, it will throw an exception.
652      *
653      * @param array the array node to check
654      */
655     private static void checkFlatNodeArray(ArrayList<Node> array) {
656         int offset = 0;
657         int index = 0;
658         for (Node n : array) {
659             if (n.mCachedAddress != offset) {
660                 throw new RuntimeException("Wrong address for node " + index
661                         + " : expected " + offset + ", got " + n.mCachedAddress);
662             }
663             ++index;
664             offset += n.mCachedSize;
665         }
666     }
667 
668     /**
669      * Helper method to write a variable-size address to a file.
670      *
671      * @param buffer the buffer to write to.
672      * @param index the index in the buffer to write the address to.
673      * @param address the address to write.
674      * @return the size in bytes the address actually took.
675      */
676     private static int writeVariableAddress(final byte[] buffer, int index, final int address) {
677         switch (getByteSize(address)) {
678         case 1:
679             buffer[index++] = (byte)address;
680             return 1;
681         case 2:
682             buffer[index++] = (byte)(0xFF & (address >> 8));
683             buffer[index++] = (byte)(0xFF & address);
684             return 2;
685         case 3:
686             buffer[index++] = (byte)(0xFF & (address >> 16));
687             buffer[index++] = (byte)(0xFF & (address >> 8));
688             buffer[index++] = (byte)(0xFF & address);
689             return 3;
690         case 0:
691             return 0;
692         default:
693             throw new RuntimeException("Address " + address + " has a strange size");
694         }
695     }
696 
makeCharGroupFlags(final CharGroup group, final int groupAddress, final int childrenOffset)697     private static byte makeCharGroupFlags(final CharGroup group, final int groupAddress,
698             final int childrenOffset) {
699         byte flags = 0;
700         if (group.mChars.length > 1) flags |= FLAG_HAS_MULTIPLE_CHARS;
701         if (group.mFrequency >= 0) {
702             flags |= FLAG_IS_TERMINAL;
703         }
704         if (null != group.mChildren) {
705             switch (getByteSize(childrenOffset)) {
706              case 1:
707                  flags |= FLAG_GROUP_ADDRESS_TYPE_ONEBYTE;
708                  break;
709              case 2:
710                  flags |= FLAG_GROUP_ADDRESS_TYPE_TWOBYTES;
711                  break;
712              case 3:
713                  flags |= FLAG_GROUP_ADDRESS_TYPE_THREEBYTES;
714                  break;
715              default:
716                  throw new RuntimeException("Node with a strange address");
717              }
718         }
719         if (null != group.mShortcutTargets) {
720             if (DBG && 0 == group.mShortcutTargets.size()) {
721                 throw new RuntimeException("0-sized shortcut list must be null");
722             }
723             flags |= FLAG_HAS_SHORTCUT_TARGETS;
724         }
725         if (null != group.mBigrams) {
726             if (DBG && 0 == group.mBigrams.size()) {
727                 throw new RuntimeException("0-sized bigram list must be null");
728             }
729             flags |= FLAG_HAS_BIGRAMS;
730         }
731         return flags;
732     }
733 
734     /**
735      * Makes the flag value for a bigram.
736      *
737      * @param more whether there are more bigrams after this one.
738      * @param offset the offset of the bigram.
739      * @param bigramFrequency the frequency of the bigram, 0..255.
740      * @param unigramFrequency the unigram frequency of the same word, 0..255.
741      * @param word the second bigram, for debugging purposes
742      * @return the flags
743      */
makeBigramFlags(final boolean more, final int offset, int bigramFrequency, final int unigramFrequency, final String word)744     private static final int makeBigramFlags(final boolean more, final int offset,
745             int bigramFrequency, final int unigramFrequency, final String word) {
746         int bigramFlags = (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0)
747                 + (offset < 0 ? FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0);
748         switch (getByteSize(offset)) {
749         case 1:
750             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
751             break;
752         case 2:
753             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
754             break;
755         case 3:
756             bigramFlags |= FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
757             break;
758         default:
759             throw new RuntimeException("Strange offset size");
760         }
761         if (unigramFrequency > bigramFrequency) {
762             MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
763                     + "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
764                     + word + " is " + unigramFrequency);
765             bigramFrequency = unigramFrequency;
766         }
767         // We compute the difference between 255 (which means probability = 1) and the
768         // unigram score. We split this into a number of discrete steps.
769         // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
770         // represents an increase of 16 steps: a value of 15 will be interpreted as the median
771         // value of the 16th step. In all justice, if the bigram frequency is low enough to be
772         // rounded below the first step (which means it is less than half a step higher than the
773         // unigram frequency) then the unigram frequency itself is the best approximation of the
774         // bigram freq that we could possibly supply, hence we should *not* include this bigram
775         // in the file at all.
776         // until this is done, we'll write 0 and slightly overestimate this case.
777         // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step
778         // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to
779         // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the
780         // step size. Then we compute the start of the first step (the one where value 0 starts)
781         // by adding half-a-step to the unigramFrequency. From there, we compute the integer
782         // number of steps to the bigramFrequency. One last thing: we want our steps to include
783         // their lower bound and exclude their higher bound so we need to have the first step
784         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
785         // Note : to reconstruct the score, the dictionary reader will need to divide
786         // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise, and add
787         // (discretizedFrequency + 0.5) times this value to get the median value of the step,
788         // which is the best approximation. This is how we get the most precise result with
789         // only four bits.
790         final double stepSize =
791                 (double)(MAX_TERMINAL_FREQUENCY - unigramFrequency) / (1.5 + MAX_BIGRAM_FREQUENCY);
792         final double firstStepStart = 1 + unigramFrequency + (stepSize / 2.0);
793         final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize);
794         // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1
795         // here. The best approximation would be the unigram freq itself, so we should not
796         // include this bigram in the dictionary. For now, register as 0, and live with the
797         // small over-estimation that we get in this case. TODO: actually remove this bigram
798         // if discretizedFrequency < 0.
799         final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
800         bigramFlags += finalBigramFrequency & FLAG_ATTRIBUTE_FREQUENCY;
801         return bigramFlags;
802     }
803 
804     /**
805      * Makes the 2-byte value for options flags.
806      */
makeOptionsValue(final FusionDictionary dictionary)807     private static final int makeOptionsValue(final FusionDictionary dictionary) {
808         final DictionaryOptions options = dictionary.mOptions;
809         final boolean hasBigrams = dictionary.hasBigrams();
810         return (options.mFrenchLigatureProcessing ? FRENCH_LIGATURE_PROCESSING_FLAG : 0)
811                 + (options.mGermanUmlautProcessing ? GERMAN_UMLAUT_PROCESSING_FLAG : 0)
812                 + (hasBigrams ? CONTAINS_BIGRAMS_FLAG : 0);
813     }
814 
815     /**
816      * Makes the flag value for a shortcut.
817      *
818      * @param more whether there are more attributes after this one.
819      * @param frequency the frequency of the attribute, 0..15
820      * @return the flags
821      */
makeShortcutFlags(final boolean more, final int frequency)822     private static final int makeShortcutFlags(final boolean more, final int frequency) {
823         return (more ? FLAG_ATTRIBUTE_HAS_NEXT : 0) + (frequency & FLAG_ATTRIBUTE_FREQUENCY);
824     }
825 
826     /**
827      * Write a node to memory. The node is expected to have its final position cached.
828      *
829      * This can be an empty map, but the more is inside the faster the lookups will be. It can
830      * be carried on as long as nodes do not move.
831      *
832      * @param dict the dictionary the node is a part of (for relative offsets).
833      * @param buffer the memory buffer to write to.
834      * @param node the node to write.
835      * @return the address of the END of the node.
836      */
writePlacedNode(FusionDictionary dict, byte[] buffer, Node node)837     private static int writePlacedNode(FusionDictionary dict, byte[] buffer, Node node) {
838         int index = node.mCachedAddress;
839 
840         final int groupCount = node.mData.size();
841         final int countSize = getGroupCountSize(node);
842         if (1 == countSize) {
843             buffer[index++] = (byte)groupCount;
844         } else if (2 == countSize) {
845             // We need to signal 2-byte size by setting the top bit of the MSB to 1, so
846             // we | 0x80 to do this.
847             buffer[index++] = (byte)((groupCount >> 8) | 0x80);
848             buffer[index++] = (byte)(groupCount & 0xFF);
849         } else {
850             throw new RuntimeException("Strange size from getGroupCountSize : " + countSize);
851         }
852         int groupAddress = index;
853         for (int i = 0; i < groupCount; ++i) {
854             CharGroup group = node.mData.get(i);
855             if (index != group.mCachedAddress) throw new RuntimeException("Bug: write index is not "
856                     + "the same as the cached address of the group : "
857                     + index + " <> " + group.mCachedAddress);
858             groupAddress += GROUP_FLAGS_SIZE + getGroupCharactersSize(group);
859             // Sanity checks.
860             if (DBG && group.mFrequency > MAX_TERMINAL_FREQUENCY) {
861                 throw new RuntimeException("A node has a frequency > " + MAX_TERMINAL_FREQUENCY
862                         + " : " + group.mFrequency);
863             }
864             if (group.mFrequency >= 0) groupAddress += GROUP_FREQUENCY_SIZE;
865             final int childrenOffset = null == group.mChildren
866                     ? NO_CHILDREN_ADDRESS : group.mChildren.mCachedAddress - groupAddress;
867             byte flags = makeCharGroupFlags(group, groupAddress, childrenOffset);
868             buffer[index++] = flags;
869             index = CharEncoding.writeCharArray(group.mChars, buffer, index);
870             if (group.hasSeveralChars()) {
871                 buffer[index++] = GROUP_CHARACTERS_TERMINATOR;
872             }
873             if (group.mFrequency >= 0) {
874                 buffer[index++] = (byte) group.mFrequency;
875             }
876             final int shift = writeVariableAddress(buffer, index, childrenOffset);
877             index += shift;
878             groupAddress += shift;
879 
880             // Write shortcuts
881             if (null != group.mShortcutTargets) {
882                 final int indexOfShortcutByteSize = index;
883                 index += GROUP_SHORTCUT_LIST_SIZE_SIZE;
884                 groupAddress += GROUP_SHORTCUT_LIST_SIZE_SIZE;
885                 final Iterator<WeightedString> shortcutIterator = group.mShortcutTargets.iterator();
886                 while (shortcutIterator.hasNext()) {
887                     final WeightedString target = shortcutIterator.next();
888                     ++groupAddress;
889                     int shortcutFlags = makeShortcutFlags(shortcutIterator.hasNext(),
890                             target.mFrequency);
891                     buffer[index++] = (byte)shortcutFlags;
892                     final int shortcutShift = CharEncoding.writeString(buffer, index, target.mWord);
893                     index += shortcutShift;
894                     groupAddress += shortcutShift;
895                 }
896                 final int shortcutByteSize = index - indexOfShortcutByteSize;
897                 if (shortcutByteSize > 0xFFFF) {
898                     throw new RuntimeException("Shortcut list too large");
899                 }
900                 buffer[indexOfShortcutByteSize] = (byte)(shortcutByteSize >> 8);
901                 buffer[indexOfShortcutByteSize + 1] = (byte)(shortcutByteSize & 0xFF);
902             }
903             // Write bigrams
904             if (null != group.mBigrams) {
905                 final Iterator<WeightedString> bigramIterator = group.mBigrams.iterator();
906                 while (bigramIterator.hasNext()) {
907                     final WeightedString bigram = bigramIterator.next();
908                     final CharGroup target =
909                             FusionDictionary.findWordInTree(dict.mRoot, bigram.mWord);
910                     final int addressOfBigram = target.mCachedAddress;
911                     final int unigramFrequencyForThisWord = target.mFrequency;
912                     ++groupAddress;
913                     final int offset = addressOfBigram - groupAddress;
914                     int bigramFlags = makeBigramFlags(bigramIterator.hasNext(), offset,
915                             bigram.mFrequency, unigramFrequencyForThisWord, bigram.mWord);
916                     buffer[index++] = (byte)bigramFlags;
917                     final int bigramShift = writeVariableAddress(buffer, index, Math.abs(offset));
918                     index += bigramShift;
919                     groupAddress += bigramShift;
920                 }
921             }
922 
923         }
924         if (index != node.mCachedAddress + node.mCachedSize) throw new RuntimeException(
925                 "Not the same size : written "
926                 + (index - node.mCachedAddress) + " bytes out of a node that should have "
927                 + node.mCachedSize + " bytes");
928         return index;
929     }
930 
931     /**
932      * Dumps a collection of useful statistics about a node array.
933      *
934      * This prints purely informative stuff, like the total estimated file size, the
935      * number of nodes, of character groups, the repartition of each address size, etc
936      *
937      * @param nodes the node array.
938      */
showStatistics(ArrayList<Node> nodes)939     private static void showStatistics(ArrayList<Node> nodes) {
940         int firstTerminalAddress = Integer.MAX_VALUE;
941         int lastTerminalAddress = Integer.MIN_VALUE;
942         int size = 0;
943         int charGroups = 0;
944         int maxGroups = 0;
945         int maxRuns = 0;
946         for (Node n : nodes) {
947             if (maxGroups < n.mData.size()) maxGroups = n.mData.size();
948             for (CharGroup cg : n.mData) {
949                 ++charGroups;
950                 if (cg.mChars.length > maxRuns) maxRuns = cg.mChars.length;
951                 if (cg.mFrequency >= 0) {
952                     if (n.mCachedAddress < firstTerminalAddress)
953                         firstTerminalAddress = n.mCachedAddress;
954                     if (n.mCachedAddress > lastTerminalAddress)
955                         lastTerminalAddress = n.mCachedAddress;
956                 }
957             }
958             if (n.mCachedAddress + n.mCachedSize > size) size = n.mCachedAddress + n.mCachedSize;
959         }
960         final int[] groupCounts = new int[maxGroups + 1];
961         final int[] runCounts = new int[maxRuns + 1];
962         for (Node n : nodes) {
963             ++groupCounts[n.mData.size()];
964             for (CharGroup cg : n.mData) {
965                 ++runCounts[cg.mChars.length];
966             }
967         }
968 
969         MakedictLog.i("Statistics:\n"
970                 + "  total file size " + size + "\n"
971                 + "  " + nodes.size() + " nodes\n"
972                 + "  " + charGroups + " groups (" + ((float)charGroups / nodes.size())
973                         + " groups per node)\n"
974                 + "  first terminal at " + firstTerminalAddress + "\n"
975                 + "  last terminal at " + lastTerminalAddress + "\n"
976                 + "  Group stats : max = " + maxGroups);
977         for (int i = 0; i < groupCounts.length; ++i) {
978             MakedictLog.i("    " + i + " : " + groupCounts[i]);
979         }
980         MakedictLog.i("  Character run stats : max = " + maxRuns);
981         for (int i = 0; i < runCounts.length; ++i) {
982             MakedictLog.i("    " + i + " : " + runCounts[i]);
983         }
984     }
985 
986     /**
987      * Dumps a FusionDictionary to a file.
988      *
989      * This is the public entry point to write a dictionary to a file.
990      *
991      * @param destination the stream to write the binary data to.
992      * @param dict the dictionary to write.
993      * @param version the version of the format to write, currently either 1 or 2.
994      */
writeDictionaryBinary(final OutputStream destination, final FusionDictionary dict, final int version)995     public static void writeDictionaryBinary(final OutputStream destination,
996             final FusionDictionary dict, final int version)
997             throws IOException, UnsupportedFormatException {
998 
999         // Addresses are limited to 3 bytes, but since addresses can be relative to each node, the
1000         // structure itself is not limited to 16MB. However, if it is over 16MB deciding the order
1001         // of the nodes becomes a quite complicated problem, because though the dictionary itself
1002         // does not have a size limit, each node must still be within 16MB of all its children and
1003         // parents. As long as this is ensured, the dictionary file may grow to any size.
1004 
1005         if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION) {
1006             throw new UnsupportedFormatException("Requested file format version " + version
1007                     + ", but this implementation only supports versions "
1008                     + MINIMUM_SUPPORTED_VERSION + " through " + MAXIMUM_SUPPORTED_VERSION);
1009         }
1010 
1011         ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256);
1012 
1013         // The magic number in big-endian order.
1014         if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
1015             // Magic number for version 2+.
1016             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 24)));
1017             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 16)));
1018             headerBuffer.write((byte) (0xFF & (VERSION_2_MAGIC_NUMBER >> 8)));
1019             headerBuffer.write((byte) (0xFF & VERSION_2_MAGIC_NUMBER));
1020             // Dictionary version.
1021             headerBuffer.write((byte) (0xFF & (version >> 8)));
1022             headerBuffer.write((byte) (0xFF & version));
1023         } else {
1024             // Magic number for version 1.
1025             headerBuffer.write((byte) (0xFF & (VERSION_1_MAGIC_NUMBER >> 8)));
1026             headerBuffer.write((byte) (0xFF & VERSION_1_MAGIC_NUMBER));
1027             // Dictionary version.
1028             headerBuffer.write((byte) (0xFF & version));
1029         }
1030         // Options flags
1031         final int options = makeOptionsValue(dict);
1032         headerBuffer.write((byte) (0xFF & (options >> 8)));
1033         headerBuffer.write((byte) (0xFF & options));
1034         if (version >= FIRST_VERSION_WITH_HEADER_SIZE) {
1035             final int headerSizeOffset = headerBuffer.size();
1036             // Placeholder to be written later with header size.
1037             for (int i = 0; i < 4; ++i) {
1038                 headerBuffer.write(0);
1039             }
1040             // Write out the options.
1041             for (final String key : dict.mOptions.mAttributes.keySet()) {
1042                 final String value = dict.mOptions.mAttributes.get(key);
1043                 CharEncoding.writeString(headerBuffer, key);
1044                 CharEncoding.writeString(headerBuffer, value);
1045             }
1046             final int size = headerBuffer.size();
1047             final byte[] bytes = headerBuffer.toByteArray();
1048             // Write out the header size.
1049             bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24));
1050             bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16));
1051             bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8));
1052             bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0));
1053             destination.write(bytes);
1054         } else {
1055             headerBuffer.writeTo(destination);
1056         }
1057 
1058         headerBuffer.close();
1059 
1060         // Leave the choice of the optimal node order to the flattenTree function.
1061         MakedictLog.i("Flattening the tree...");
1062         ArrayList<Node> flatNodes = flattenTree(dict.mRoot);
1063 
1064         MakedictLog.i("Computing addresses...");
1065         computeAddresses(dict, flatNodes);
1066         MakedictLog.i("Checking array...");
1067         if (DBG) checkFlatNodeArray(flatNodes);
1068 
1069         // Create a buffer that matches the final dictionary size.
1070         final Node lastNode = flatNodes.get(flatNodes.size() - 1);
1071         final int bufferSize =(lastNode.mCachedAddress + lastNode.mCachedSize);
1072         final byte[] buffer = new byte[bufferSize];
1073         int index = 0;
1074 
1075         MakedictLog.i("Writing file...");
1076         int dataEndOffset = 0;
1077         for (Node n : flatNodes) {
1078             dataEndOffset = writePlacedNode(dict, buffer, n);
1079         }
1080 
1081         if (DBG) showStatistics(flatNodes);
1082 
1083         destination.write(buffer, 0, dataEndOffset);
1084 
1085         destination.close();
1086         MakedictLog.i("Done");
1087     }
1088 
1089 
1090     // Input methods: Read a binary dictionary to memory.
1091     // readDictionaryBinary is the public entry point for them.
1092 
1093     static final int[] characterBuffer = new int[MAX_WORD_LENGTH];
readCharGroup(RandomAccessFile source, final int originalGroupAddress)1094     private static CharGroupInfo readCharGroup(RandomAccessFile source,
1095             final int originalGroupAddress) throws IOException {
1096         int addressPointer = originalGroupAddress;
1097         final int flags = source.readUnsignedByte();
1098         ++addressPointer;
1099         final int characters[];
1100         if (0 != (flags & FLAG_HAS_MULTIPLE_CHARS)) {
1101             int index = 0;
1102             int character = CharEncoding.readChar(source);
1103             addressPointer += CharEncoding.getCharSize(character);
1104             while (-1 != character) {
1105                 characterBuffer[index++] = character;
1106                 character = CharEncoding.readChar(source);
1107                 addressPointer += CharEncoding.getCharSize(character);
1108             }
1109             characters = Arrays.copyOfRange(characterBuffer, 0, index);
1110         } else {
1111             final int character = CharEncoding.readChar(source);
1112             addressPointer += CharEncoding.getCharSize(character);
1113             characters = new int[] { character };
1114         }
1115         final int frequency;
1116         if (0 != (FLAG_IS_TERMINAL & flags)) {
1117             ++addressPointer;
1118             frequency = source.readUnsignedByte();
1119         } else {
1120             frequency = CharGroup.NOT_A_TERMINAL;
1121         }
1122         int childrenAddress = addressPointer;
1123         switch (flags & MASK_GROUP_ADDRESS_TYPE) {
1124         case FLAG_GROUP_ADDRESS_TYPE_ONEBYTE:
1125             childrenAddress += source.readUnsignedByte();
1126             addressPointer += 1;
1127             break;
1128         case FLAG_GROUP_ADDRESS_TYPE_TWOBYTES:
1129             childrenAddress += source.readUnsignedShort();
1130             addressPointer += 2;
1131             break;
1132         case FLAG_GROUP_ADDRESS_TYPE_THREEBYTES:
1133             childrenAddress += (source.readUnsignedByte() << 16) + source.readUnsignedShort();
1134             addressPointer += 3;
1135             break;
1136         case FLAG_GROUP_ADDRESS_TYPE_NOADDRESS:
1137         default:
1138             childrenAddress = NO_CHILDREN_ADDRESS;
1139             break;
1140         }
1141         ArrayList<WeightedString> shortcutTargets = null;
1142         if (0 != (flags & FLAG_HAS_SHORTCUT_TARGETS)) {
1143             final long pointerBefore = source.getFilePointer();
1144             shortcutTargets = new ArrayList<WeightedString>();
1145             source.readUnsignedShort(); // Skip the size
1146             while (true) {
1147                 final int targetFlags = source.readUnsignedByte();
1148                 final String word = CharEncoding.readString(source);
1149                 shortcutTargets.add(new WeightedString(word,
1150                         targetFlags & FLAG_ATTRIBUTE_FREQUENCY));
1151                 if (0 == (targetFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
1152             }
1153             addressPointer += (source.getFilePointer() - pointerBefore);
1154         }
1155         ArrayList<PendingAttribute> bigrams = null;
1156         if (0 != (flags & FLAG_HAS_BIGRAMS)) {
1157             bigrams = new ArrayList<PendingAttribute>();
1158             while (true) {
1159                 final int bigramFlags = source.readUnsignedByte();
1160                 ++addressPointer;
1161                 final int sign = 0 == (bigramFlags & FLAG_ATTRIBUTE_OFFSET_NEGATIVE) ? 1 : -1;
1162                 int bigramAddress = addressPointer;
1163                 switch (bigramFlags & MASK_ATTRIBUTE_ADDRESS_TYPE) {
1164                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
1165                     bigramAddress += sign * source.readUnsignedByte();
1166                     addressPointer += 1;
1167                     break;
1168                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
1169                     bigramAddress += sign * source.readUnsignedShort();
1170                     addressPointer += 2;
1171                     break;
1172                 case FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
1173                     final int offset = ((source.readUnsignedByte() << 16)
1174                             + source.readUnsignedShort());
1175                     bigramAddress += sign * offset;
1176                     addressPointer += 3;
1177                     break;
1178                 default:
1179                     throw new RuntimeException("Has bigrams with no address");
1180                 }
1181                 bigrams.add(new PendingAttribute(bigramFlags & FLAG_ATTRIBUTE_FREQUENCY,
1182                         bigramAddress));
1183                 if (0 == (bigramFlags & FLAG_ATTRIBUTE_HAS_NEXT)) break;
1184             }
1185         }
1186         return new CharGroupInfo(originalGroupAddress, addressPointer, flags, characters, frequency,
1187                 childrenAddress, shortcutTargets, bigrams);
1188     }
1189 
1190     /**
1191      * Reads and returns the char group count out of a file and forwards the pointer.
1192      */
readCharGroupCount(RandomAccessFile source)1193     private static int readCharGroupCount(RandomAccessFile source) throws IOException {
1194         final int msb = source.readUnsignedByte();
1195         if (MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT >= msb) {
1196             return msb;
1197         } else {
1198             return ((MAX_CHARGROUPS_FOR_ONE_BYTE_CHARGROUP_COUNT & msb) << 8)
1199                     + source.readUnsignedByte();
1200         }
1201     }
1202 
1203     // The word cache here is a stopgap bandaid to help the catastrophic performance
1204     // of this method. Since it performs direct, unbuffered random access to the file and
1205     // may be called hundreds of thousands of times, the resulting performance is not
1206     // reasonable without some kind of cache. Thus:
1207     // TODO: perform buffered I/O here and in other places in the code.
1208     private static TreeMap<Integer, String> wordCache = new TreeMap<Integer, String>();
1209     /**
1210      * Finds, as a string, the word at the address passed as an argument.
1211      *
1212      * @param source the file to read from.
1213      * @param headerSize the size of the header.
1214      * @param address the address to seek.
1215      * @return the word, as a string.
1216      * @throws IOException if the file can't be read.
1217      */
getWordAtAddress(final RandomAccessFile source, final long headerSize, int address)1218     private static String getWordAtAddress(final RandomAccessFile source, final long headerSize,
1219             int address) throws IOException {
1220         final String cachedString = wordCache.get(address);
1221         if (null != cachedString) return cachedString;
1222         final long originalPointer = source.getFilePointer();
1223         source.seek(headerSize);
1224         final int count = readCharGroupCount(source);
1225         int groupOffset = getGroupCountSize(count);
1226         final StringBuilder builder = new StringBuilder();
1227         String result = null;
1228 
1229         CharGroupInfo last = null;
1230         for (int i = count - 1; i >= 0; --i) {
1231             CharGroupInfo info = readCharGroup(source, groupOffset);
1232             groupOffset = info.mEndAddress;
1233             if (info.mOriginalAddress == address) {
1234                 builder.append(new String(info.mCharacters, 0, info.mCharacters.length));
1235                 result = builder.toString();
1236                 break; // and return
1237             }
1238             if (hasChildrenAddress(info.mChildrenAddress)) {
1239                 if (info.mChildrenAddress > address) {
1240                     if (null == last) continue;
1241                     builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
1242                     source.seek(last.mChildrenAddress + headerSize);
1243                     groupOffset = last.mChildrenAddress + 1;
1244                     i = source.readUnsignedByte();
1245                     last = null;
1246                     continue;
1247                 }
1248                 last = info;
1249             }
1250             if (0 == i && hasChildrenAddress(last.mChildrenAddress)) {
1251                 builder.append(new String(last.mCharacters, 0, last.mCharacters.length));
1252                 source.seek(last.mChildrenAddress + headerSize);
1253                 groupOffset = last.mChildrenAddress + 1;
1254                 i = source.readUnsignedByte();
1255                 last = null;
1256                 continue;
1257             }
1258         }
1259         source.seek(originalPointer);
1260         wordCache.put(address, result);
1261         return result;
1262     }
1263 
1264     /**
1265      * Reads a single node from a binary file.
1266      *
1267      * This methods reads the file at the current position of its file pointer. A node is
1268      * fully expected to start at the current position.
1269      * This will recursively read other nodes into the structure, populating the reverse
1270      * maps on the fly and using them to keep track of already read nodes.
1271      *
1272      * @param source the data file, correctly positioned at the start of a node.
1273      * @param headerSize the size, in bytes, of the file header.
1274      * @param reverseNodeMap a mapping from addresses to already read nodes.
1275      * @param reverseGroupMap a mapping from addresses to already read character groups.
1276      * @return the read node with all his children already read.
1277      */
readNode(RandomAccessFile source, long headerSize, Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)1278     private static Node readNode(RandomAccessFile source, long headerSize,
1279             Map<Integer, Node> reverseNodeMap, Map<Integer, CharGroup> reverseGroupMap)
1280             throws IOException {
1281         final int nodeOrigin = (int)(source.getFilePointer() - headerSize);
1282         final int count = readCharGroupCount(source);
1283         final ArrayList<CharGroup> nodeContents = new ArrayList<CharGroup>();
1284         int groupOffset = nodeOrigin + getGroupCountSize(count);
1285         for (int i = count; i > 0; --i) {
1286             CharGroupInfo info = readCharGroup(source, groupOffset);
1287             ArrayList<WeightedString> shortcutTargets = info.mShortcutTargets;
1288             ArrayList<WeightedString> bigrams = null;
1289             if (null != info.mBigrams) {
1290                 bigrams = new ArrayList<WeightedString>();
1291                 for (PendingAttribute bigram : info.mBigrams) {
1292                     final String word = getWordAtAddress(source, headerSize, bigram.mAddress);
1293                     bigrams.add(new WeightedString(word, bigram.mFrequency));
1294                 }
1295             }
1296             if (hasChildrenAddress(info.mChildrenAddress)) {
1297                 Node children = reverseNodeMap.get(info.mChildrenAddress);
1298                 if (null == children) {
1299                     final long currentPosition = source.getFilePointer();
1300                     source.seek(info.mChildrenAddress + headerSize);
1301                     children = readNode(source, headerSize, reverseNodeMap, reverseGroupMap);
1302                     source.seek(currentPosition);
1303                 }
1304                 nodeContents.add(
1305                         new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency,
1306                                 children));
1307             } else {
1308                 nodeContents.add(
1309                         new CharGroup(info.mCharacters, shortcutTargets, bigrams, info.mFrequency));
1310             }
1311             groupOffset = info.mEndAddress;
1312         }
1313         final Node node = new Node(nodeContents);
1314         node.mCachedAddress = nodeOrigin;
1315         reverseNodeMap.put(node.mCachedAddress, node);
1316         return node;
1317     }
1318 
1319     /**
1320      * Helper function to get the binary format version from the header.
1321      */
getFormatVersion(final RandomAccessFile source)1322     private static int getFormatVersion(final RandomAccessFile source) throws IOException {
1323         final int magic_v1 = source.readUnsignedShort();
1324         if (VERSION_1_MAGIC_NUMBER == magic_v1) return source.readUnsignedByte();
1325         final int magic_v2 = (magic_v1 << 16) + source.readUnsignedShort();
1326         if (VERSION_2_MAGIC_NUMBER == magic_v2) return source.readUnsignedShort();
1327         return NOT_A_VERSION_NUMBER;
1328     }
1329 
1330     /**
1331      * Reads a random access file and returns the memory representation of the dictionary.
1332      *
1333      * This high-level method takes a binary file and reads its contents, populating a
1334      * FusionDictionary structure. The optional dict argument is an existing dictionary to
1335      * which words from the file should be added. If it is null, a new dictionary is created.
1336      *
1337      * @param source the file to read.
1338      * @param dict an optional dictionary to add words to, or null.
1339      * @return the created (or merged) dictionary.
1340      */
readDictionaryBinary(final RandomAccessFile source, final FusionDictionary dict)1341     public static FusionDictionary readDictionaryBinary(final RandomAccessFile source,
1342             final FusionDictionary dict) throws IOException, UnsupportedFormatException {
1343         // Check file version
1344         final int version = getFormatVersion(source);
1345         if (version < MINIMUM_SUPPORTED_VERSION || version > MAXIMUM_SUPPORTED_VERSION ) {
1346             throw new UnsupportedFormatException("This file has version " + version
1347                     + ", but this implementation does not support versions above "
1348                     + MAXIMUM_SUPPORTED_VERSION);
1349         }
1350 
1351         // Read options
1352         final int optionsFlags = source.readUnsignedShort();
1353 
1354         final long headerSize;
1355         final HashMap<String, String> options = new HashMap<String, String>();
1356         if (version < FIRST_VERSION_WITH_HEADER_SIZE) {
1357             headerSize = source.getFilePointer();
1358         } else {
1359             headerSize = (source.readUnsignedByte() << 24) + (source.readUnsignedByte() << 16)
1360                     + (source.readUnsignedByte() << 8) + source.readUnsignedByte();
1361             while (source.getFilePointer() < headerSize) {
1362                 final String key = CharEncoding.readString(source);
1363                 final String value = CharEncoding.readString(source);
1364                 options.put(key, value);
1365             }
1366             source.seek(headerSize);
1367         }
1368 
1369         Map<Integer, Node> reverseNodeMapping = new TreeMap<Integer, Node>();
1370         Map<Integer, CharGroup> reverseGroupMapping = new TreeMap<Integer, CharGroup>();
1371         final Node root = readNode(source, headerSize, reverseNodeMapping, reverseGroupMapping);
1372 
1373         FusionDictionary newDict = new FusionDictionary(root,
1374                 new FusionDictionary.DictionaryOptions(options,
1375                         0 != (optionsFlags & GERMAN_UMLAUT_PROCESSING_FLAG),
1376                         0 != (optionsFlags & FRENCH_LIGATURE_PROCESSING_FLAG)));
1377         if (null != dict) {
1378             for (final Word w : dict) {
1379                 newDict.add(w.mWord, w.mFrequency, w.mShortcutTargets);
1380             }
1381             for (final Word w : dict) {
1382                 // By construction a binary dictionary may not have bigrams pointing to
1383                 // words that are not also registered as unigrams so we don't have to avoid
1384                 // them explicitly here.
1385                 for (final WeightedString bigram : w.mBigrams) {
1386                     newDict.setBigram(w.mWord, bigram.mWord, bigram.mFrequency);
1387                 }
1388             }
1389         }
1390 
1391         return newDict;
1392     }
1393 
1394     /**
1395      * Basic test to find out whether the file is a binary dictionary or not.
1396      *
1397      * Concretely this only tests the magic number.
1398      *
1399      * @param filename The name of the file to test.
1400      * @return true if it's a binary dictionary, false otherwise
1401      */
isBinaryDictionary(final String filename)1402     public static boolean isBinaryDictionary(final String filename) {
1403         try {
1404             RandomAccessFile f = new RandomAccessFile(filename, "r");
1405             final int version = getFormatVersion(f);
1406             return (version >= MINIMUM_SUPPORTED_VERSION && version <= MAXIMUM_SUPPORTED_VERSION);
1407         } catch (FileNotFoundException e) {
1408             return false;
1409         } catch (IOException e) {
1410             return false;
1411         }
1412     }
1413 }
1414