• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2013 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.latin.makedict.BinaryDictDecoderUtils.CharEncoding;
20 import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
21 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNode;
22 import com.android.inputmethod.latin.makedict.FusionDictionary.DictionaryOptions;
23 import com.android.inputmethod.latin.makedict.FusionDictionary.PtNodeArray;
24 import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
25 
26 import java.io.ByteArrayOutputStream;
27 import java.io.IOException;
28 import java.io.OutputStream;
29 import java.util.ArrayList;
30 
31 /**
32  * Encodes binary files for a FusionDictionary.
33  *
34  * All the methods in this class are static.
35  *
36  * TODO: Rename this class to DictEncoderUtils.
37  */
38 public class BinaryDictEncoderUtils {
39 
40     private static final boolean DBG = MakedictLog.DBG;
41 
BinaryDictEncoderUtils()42     private BinaryDictEncoderUtils() {
43         // This utility class is not publicly instantiable.
44     }
45 
46     // Arbitrary limit to how much passes we consider address size compression should
47     // terminate in. At the time of this writing, our largest dictionary completes
48     // compression in five passes.
49     // If the number of passes exceeds this number, makedict bails with an exception on
50     // suspicion that a bug might be causing an infinite loop.
51     private static final int MAX_PASSES = 24;
52 
53     /**
54      * Compute the binary size of the character array.
55      *
56      * If only one character, this is the size of this character. If many, it's the sum of their
57      * sizes + 1 byte for the terminator.
58      *
59      * @param characters the character array
60      * @return the size of the char array, including the terminator if any
61      */
getPtNodeCharactersSize(final int[] characters)62     static int getPtNodeCharactersSize(final int[] characters) {
63         int size = CharEncoding.getCharArraySize(characters);
64         if (characters.length > 1) size += FormatSpec.PTNODE_TERMINATOR_SIZE;
65         return size;
66     }
67 
68     /**
69      * Compute the binary size of the character array in a PtNode
70      *
71      * If only one character, this is the size of this character. If many, it's the sum of their
72      * sizes + 1 byte for the terminator.
73      *
74      * @param ptNode the PtNode
75      * @return the size of the char array, including the terminator if any
76      */
getPtNodeCharactersSize(final PtNode ptNode)77     private static int getPtNodeCharactersSize(final PtNode ptNode) {
78         return getPtNodeCharactersSize(ptNode.mChars);
79     }
80 
81     /**
82      * Compute the binary size of the PtNode count for a node array.
83      * @param nodeArray the nodeArray
84      * @return the size of the PtNode count, either 1 or 2 bytes.
85      */
getPtNodeCountSize(final PtNodeArray nodeArray)86     private static int getPtNodeCountSize(final PtNodeArray nodeArray) {
87         return BinaryDictIOUtils.getPtNodeCountSize(nodeArray.mData.size());
88     }
89 
90     /**
91      * Compute the size of a shortcut in bytes.
92      */
getShortcutSize(final WeightedString shortcut)93     private static int getShortcutSize(final WeightedString shortcut) {
94         int size = FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
95         final String word = shortcut.mWord;
96         final int length = word.length();
97         for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
98             final int codePoint = word.codePointAt(i);
99             size += CharEncoding.getCharSize(codePoint);
100         }
101         size += FormatSpec.PTNODE_TERMINATOR_SIZE;
102         return size;
103     }
104 
105     /**
106      * Compute the size of a shortcut list in bytes.
107      *
108      * This is known in advance and does not change according to position in the file
109      * like address lists do.
110      */
getShortcutListSize(final ArrayList<WeightedString> shortcutList)111     static int getShortcutListSize(final ArrayList<WeightedString> shortcutList) {
112         if (null == shortcutList || shortcutList.isEmpty()) return 0;
113         int size = FormatSpec.PTNODE_SHORTCUT_LIST_SIZE_SIZE;
114         for (final WeightedString shortcut : shortcutList) {
115             size += getShortcutSize(shortcut);
116         }
117         return size;
118     }
119 
120     /**
121      * Compute the maximum size of a PtNode, assuming 3-byte addresses for everything.
122      *
123      * @param ptNode the PtNode to compute the size of.
124      * @param options file format options.
125      * @return the maximum size of the PtNode.
126      */
getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options)127     private static int getPtNodeMaximumSize(final PtNode ptNode, final FormatOptions options) {
128         int size = getNodeHeaderSize(ptNode, options);
129         if (ptNode.isTerminal()) {
130             // If terminal, one byte for the frequency or four bytes for the terminal id.
131             if (options.mHasTerminalId) {
132                 size += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
133             } else {
134                 size += FormatSpec.PTNODE_FREQUENCY_SIZE;
135             }
136         }
137         size += FormatSpec.PTNODE_MAX_ADDRESS_SIZE; // For children address
138         size += getShortcutListSize(ptNode.mShortcutTargets);
139         if (null != ptNode.mBigrams) {
140             size += (FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE
141                     + FormatSpec.PTNODE_ATTRIBUTE_MAX_ADDRESS_SIZE)
142                     * ptNode.mBigrams.size();
143         }
144         return size;
145     }
146 
147     /**
148      * Compute the maximum size of each PtNode of a PtNode array, assuming 3-byte addresses for
149      * everything, and caches it in the `mCachedSize' member of the nodes; deduce the size of
150      * the containing node array, and cache it it its 'mCachedSize' member.
151      *
152      * @param ptNodeArray the node array to compute the maximum size of.
153      * @param options file format options.
154      */
calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray, final FormatOptions options)155     private static void calculatePtNodeArrayMaximumSize(final PtNodeArray ptNodeArray,
156             final FormatOptions options) {
157         int size = getPtNodeCountSize(ptNodeArray);
158         for (PtNode node : ptNodeArray.mData) {
159             final int nodeSize = getPtNodeMaximumSize(node, options);
160             node.mCachedSize = nodeSize;
161             size += nodeSize;
162         }
163         if (options.mSupportsDynamicUpdate) {
164             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
165         }
166         ptNodeArray.mCachedSize = size;
167     }
168 
169     /**
170      * Compute the size of the header (flag + [parent address] + characters size) of a PtNode.
171      *
172      * @param ptNode the PtNode of which to compute the size of the header
173      * @param options file format options.
174      */
getNodeHeaderSize(final PtNode ptNode, final FormatOptions options)175     private static int getNodeHeaderSize(final PtNode ptNode, final FormatOptions options) {
176         if (BinaryDictIOUtils.supportsDynamicUpdate(options)) {
177             return FormatSpec.PTNODE_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
178                     + getPtNodeCharactersSize(ptNode);
179         } else {
180             return FormatSpec.PTNODE_FLAGS_SIZE + getPtNodeCharactersSize(ptNode);
181         }
182     }
183 
184     /**
185      * Compute the size, in bytes, that an address will occupy.
186      *
187      * This can be used either for children addresses (which are always positive) or for
188      * attribute, which may be positive or negative but
189      * store their sign bit separately.
190      *
191      * @param address the address
192      * @return the byte size.
193      */
getByteSize(final int address)194     static int getByteSize(final int address) {
195         assert(address <= FormatSpec.UINT24_MAX);
196         if (!BinaryDictIOUtils.hasChildrenAddress(address)) {
197             return 0;
198         } else if (Math.abs(address) <= FormatSpec.UINT8_MAX) {
199             return 1;
200         } else if (Math.abs(address) <= FormatSpec.UINT16_MAX) {
201             return 2;
202         } else {
203             return 3;
204         }
205     }
206 
writeUIntToBuffer(final byte[] buffer, int position, final int value, final int size)207     static int writeUIntToBuffer(final byte[] buffer, int position, final int value,
208             final int size) {
209         switch(size) {
210             case 4:
211                 buffer[position++] = (byte) ((value >> 24) & 0xFF);
212                 /* fall through */
213             case 3:
214                 buffer[position++] = (byte) ((value >> 16) & 0xFF);
215                 /* fall through */
216             case 2:
217                 buffer[position++] = (byte) ((value >> 8) & 0xFF);
218                 /* fall through */
219             case 1:
220                 buffer[position++] = (byte) (value & 0xFF);
221                 break;
222             default:
223                 /* nop */
224         }
225         return position;
226     }
227 
writeUIntToStream(final OutputStream stream, final int value, final int size)228     static void writeUIntToStream(final OutputStream stream, final int value, final int size)
229             throws IOException {
230         switch(size) {
231             case 4:
232                 stream.write((value >> 24) & 0xFF);
233                 /* fall through */
234             case 3:
235                 stream.write((value >> 16) & 0xFF);
236                 /* fall through */
237             case 2:
238                 stream.write((value >> 8) & 0xFF);
239                 /* fall through */
240             case 1:
241                 stream.write(value & 0xFF);
242                 break;
243             default:
244                 /* nop */
245         }
246     }
247 
248     // End utility methods
249 
250     // This method is responsible for finding a nice ordering of the nodes that favors run-time
251     // cache performance and dictionary size.
flattenTree( final PtNodeArray rootNodeArray)252     /* package for tests */ static ArrayList<PtNodeArray> flattenTree(
253             final PtNodeArray rootNodeArray) {
254         final int treeSize = FusionDictionary.countPtNodes(rootNodeArray);
255         MakedictLog.i("Counted nodes : " + treeSize);
256         final ArrayList<PtNodeArray> flatTree = new ArrayList<PtNodeArray>(treeSize);
257         return flattenTreeInner(flatTree, rootNodeArray);
258     }
259 
flattenTreeInner(final ArrayList<PtNodeArray> list, final PtNodeArray ptNodeArray)260     private static ArrayList<PtNodeArray> flattenTreeInner(final ArrayList<PtNodeArray> list,
261             final PtNodeArray ptNodeArray) {
262         // Removing the node is necessary if the tails are merged, because we would then
263         // add the same node several times when we only want it once. A number of places in
264         // the code also depends on any node being only once in the list.
265         // Merging tails can only be done if there are no attributes. Searching for attributes
266         // in LatinIME code depends on a total breadth-first ordering, which merging tails
267         // breaks. If there are no attributes, it should be fine (and reduce the file size)
268         // to merge tails, and removing the node from the list would be necessary. However,
269         // we don't merge tails because breaking the breadth-first ordering would result in
270         // extreme overhead at bigram lookup time (it would make the search function O(n) instead
271         // of the current O(log(n)), where n=number of nodes in the dictionary which is pretty
272         // high).
273         // If no nodes are ever merged, we can't have the same node twice in the list, hence
274         // searching for duplicates in unnecessary. It is also very performance consuming,
275         // since `list' is an ArrayList so it's an O(n) operation that runs on all nodes, making
276         // this simple list.remove operation O(n*n) overall. On Android this overhead is very
277         // high.
278         // For future reference, the code to remove duplicate is a simple : list.remove(node);
279         list.add(ptNodeArray);
280         final ArrayList<PtNode> branches = ptNodeArray.mData;
281         for (PtNode ptNode : branches) {
282             if (null != ptNode.mChildren) flattenTreeInner(list, ptNode.mChildren);
283         }
284         return list;
285     }
286 
287     /**
288      * Get the offset from a position inside a current node array to a target node array, during
289      * update.
290      *
291      * If the current node array is before the target node array, the target node array has not
292      * been updated yet, so we should return the offset from the old position of the current node
293      * array to the old position of the target node array. If on the other hand the target is
294      * before the current node array, it already has been updated, so we should return the offset
295      * from the new position in the current node array to the new position in the target node
296      * array.
297      *
298      * @param currentNodeArray node array containing the PtNode where the offset will be written
299      * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
300      * @param targetNodeArray the target node array to get the offset to
301      * @return the offset to the target node array
302      */
getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray, final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray)303     private static int getOffsetToTargetNodeArrayDuringUpdate(final PtNodeArray currentNodeArray,
304             final int offsetFromStartOfCurrentNodeArray, final PtNodeArray targetNodeArray) {
305         final boolean isTargetBeforeCurrent = (targetNodeArray.mCachedAddressBeforeUpdate
306                 < currentNodeArray.mCachedAddressBeforeUpdate);
307         if (isTargetBeforeCurrent) {
308             return targetNodeArray.mCachedAddressAfterUpdate
309                     - (currentNodeArray.mCachedAddressAfterUpdate
310                             + offsetFromStartOfCurrentNodeArray);
311         } else {
312             return targetNodeArray.mCachedAddressBeforeUpdate
313                     - (currentNodeArray.mCachedAddressBeforeUpdate
314                             + offsetFromStartOfCurrentNodeArray);
315         }
316     }
317 
318     /**
319      * Get the offset from a position inside a current node array to a target PtNode, during
320      * update.
321      *
322      * @param currentNodeArray node array containing the PtNode where the offset will be written
323      * @param offsetFromStartOfCurrentNodeArray offset, in bytes, from the start of currentNodeArray
324      * @param targetPtNode the target PtNode to get the offset to
325      * @return the offset to the target PtNode
326      */
327     // TODO: is there any way to factorize this method with the one above?
getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray, final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode)328     private static int getOffsetToTargetPtNodeDuringUpdate(final PtNodeArray currentNodeArray,
329             final int offsetFromStartOfCurrentNodeArray, final PtNode targetPtNode) {
330         final int oldOffsetBasePoint = currentNodeArray.mCachedAddressBeforeUpdate
331                 + offsetFromStartOfCurrentNodeArray;
332         final boolean isTargetBeforeCurrent = (targetPtNode.mCachedAddressBeforeUpdate
333                 < oldOffsetBasePoint);
334         // If the target is before the current node array, then its address has already been
335         // updated. We can use the AfterUpdate member, and compare it to our own member after
336         // update. Otherwise, the AfterUpdate member is not updated yet, so we need to use the
337         // BeforeUpdate member, and of course we have to compare this to our own address before
338         // update.
339         if (isTargetBeforeCurrent) {
340             final int newOffsetBasePoint = currentNodeArray.mCachedAddressAfterUpdate
341                     + offsetFromStartOfCurrentNodeArray;
342             return targetPtNode.mCachedAddressAfterUpdate - newOffsetBasePoint;
343         } else {
344             return targetPtNode.mCachedAddressBeforeUpdate - oldOffsetBasePoint;
345         }
346     }
347 
348     /**
349      * Computes the actual node array size, based on the cached addresses of the children nodes.
350      *
351      * Each node array stores its tentative address. During dictionary address computing, these
352      * are not final, but they can be used to compute the node array size (the node array size
353      * depends on the address of the children because the number of bytes necessary to store an
354      * address depends on its numeric value. The return value indicates whether the node array
355      * contents (as in, any of the addresses stored in the cache fields) have changed with
356      * respect to their previous value.
357      *
358      * @param ptNodeArray the node array to compute the size of.
359      * @param dict the dictionary in which the word/attributes are to be found.
360      * @param formatOptions file format options.
361      * @return false if none of the cached addresses inside the node array changed, true otherwise.
362      */
computeActualPtNodeArraySize(final PtNodeArray ptNodeArray, final FusionDictionary dict, final FormatOptions formatOptions)363     private static boolean computeActualPtNodeArraySize(final PtNodeArray ptNodeArray,
364             final FusionDictionary dict, final FormatOptions formatOptions) {
365         boolean changed = false;
366         int size = getPtNodeCountSize(ptNodeArray);
367         for (PtNode ptNode : ptNodeArray.mData) {
368             ptNode.mCachedAddressAfterUpdate = ptNodeArray.mCachedAddressAfterUpdate + size;
369             if (ptNode.mCachedAddressAfterUpdate != ptNode.mCachedAddressBeforeUpdate) {
370                 changed = true;
371             }
372             int nodeSize = getNodeHeaderSize(ptNode, formatOptions);
373             if (ptNode.isTerminal()) {
374                 if (formatOptions.mHasTerminalId) {
375                     nodeSize += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
376                 } else {
377                     nodeSize += FormatSpec.PTNODE_FREQUENCY_SIZE;
378                 }
379             }
380             if (formatOptions.mSupportsDynamicUpdate) {
381                 nodeSize += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
382             } else if (null != ptNode.mChildren) {
383                 nodeSize += getByteSize(getOffsetToTargetNodeArrayDuringUpdate(ptNodeArray,
384                         nodeSize + size, ptNode.mChildren));
385             }
386             if (formatOptions.mVersion < FormatSpec.FIRST_VERSION_WITH_TERMINAL_ID) {
387                 nodeSize += getShortcutListSize(ptNode.mShortcutTargets);
388                 if (null != ptNode.mBigrams) {
389                     for (WeightedString bigram : ptNode.mBigrams) {
390                         final int offset = getOffsetToTargetPtNodeDuringUpdate(ptNodeArray,
391                                 nodeSize + size + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE,
392                                 FusionDictionary.findWordInTree(dict.mRootNodeArray, bigram.mWord));
393                         nodeSize += getByteSize(offset) + FormatSpec.PTNODE_ATTRIBUTE_FLAGS_SIZE;
394                     }
395                 }
396             }
397             ptNode.mCachedSize = nodeSize;
398             size += nodeSize;
399         }
400         if (formatOptions.mSupportsDynamicUpdate) {
401             size += FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
402         }
403         if (ptNodeArray.mCachedSize != size) {
404             ptNodeArray.mCachedSize = size;
405             changed = true;
406         }
407         return changed;
408     }
409 
410     /**
411      * Initializes the cached addresses of node arrays and their containing nodes from their size.
412      *
413      * @param flatNodes the list of node arrays.
414      * @param formatOptions file format options.
415      * @return the byte size of the entire stack.
416      */
initializePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions)417     private static int initializePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes,
418             final FormatOptions formatOptions) {
419         int nodeArrayOffset = 0;
420         for (final PtNodeArray nodeArray : flatNodes) {
421             nodeArray.mCachedAddressBeforeUpdate = nodeArrayOffset;
422             int nodeCountSize = getPtNodeCountSize(nodeArray);
423             int nodeffset = 0;
424             for (final PtNode ptNode : nodeArray.mData) {
425                 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate =
426                         nodeCountSize + nodeArrayOffset + nodeffset;
427                 nodeffset += ptNode.mCachedSize;
428             }
429             nodeArrayOffset += nodeArray.mCachedSize;
430         }
431         return nodeArrayOffset;
432     }
433 
434     /**
435      * Updates the cached addresses of node arrays after recomputing their new positions.
436      *
437      * @param flatNodes the list of node arrays.
438      */
updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes)439     private static void updatePtNodeArraysCachedAddresses(final ArrayList<PtNodeArray> flatNodes) {
440         for (final PtNodeArray nodeArray : flatNodes) {
441             nodeArray.mCachedAddressBeforeUpdate = nodeArray.mCachedAddressAfterUpdate;
442             for (final PtNode ptNode : nodeArray.mData) {
443                 ptNode.mCachedAddressBeforeUpdate = ptNode.mCachedAddressAfterUpdate;
444             }
445         }
446     }
447 
448     /**
449      * Compute the cached parent addresses after all has been updated.
450      *
451      * The parent addresses are used by some binary formats at write-to-disk time. Not all formats
452      * need them. In particular, version 2 does not need them, and version 3 does.
453      *
454      * @param flatNodes the flat array of node arrays to fill in
455      */
computeParentAddresses(final ArrayList<PtNodeArray> flatNodes)456     private static void computeParentAddresses(final ArrayList<PtNodeArray> flatNodes) {
457         for (final PtNodeArray nodeArray : flatNodes) {
458             for (final PtNode ptNode : nodeArray.mData) {
459                 if (null != ptNode.mChildren) {
460                     // Assign my address to children's parent address
461                     // Here BeforeUpdate and AfterUpdate addresses have the same value, so it
462                     // does not matter which we use.
463                     ptNode.mChildren.mCachedParentAddress = ptNode.mCachedAddressAfterUpdate
464                             - ptNode.mChildren.mCachedAddressAfterUpdate;
465                 }
466             }
467         }
468     }
469 
470     /**
471      * Compute the addresses and sizes of an ordered list of PtNode arrays.
472      *
473      * This method takes a list of PtNode arrays and will update their cached address and size
474      * values so that they can be written into a file. It determines the smallest size each of the
475      * PtNode arrays can be given the addresses of its children and attributes, and store that into
476      * each PtNode.
477      * The order of the PtNode is given by the order of the array. This method makes no effort
478      * to find a good order; it only mechanically computes the size this order results in.
479      *
480      * @param dict the dictionary
481      * @param flatNodes the ordered list of PtNode arrays
482      * @param formatOptions file format options.
483      * @return the same array it was passed. The nodes have been updated for address and size.
484      */
computeAddresses(final FusionDictionary dict, final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions)485     /* package */ static ArrayList<PtNodeArray> computeAddresses(final FusionDictionary dict,
486             final ArrayList<PtNodeArray> flatNodes, final FormatOptions formatOptions) {
487         // First get the worst possible sizes and offsets
488         for (final PtNodeArray n : flatNodes) calculatePtNodeArrayMaximumSize(n, formatOptions);
489         final int offset = initializePtNodeArraysCachedAddresses(flatNodes, formatOptions);
490 
491         MakedictLog.i("Compressing the array addresses. Original size : " + offset);
492         MakedictLog.i("(Recursively seen size : " + offset + ")");
493 
494         int passes = 0;
495         boolean changesDone = false;
496         do {
497             changesDone = false;
498             int ptNodeArrayStartOffset = 0;
499             for (final PtNodeArray ptNodeArray : flatNodes) {
500                 ptNodeArray.mCachedAddressAfterUpdate = ptNodeArrayStartOffset;
501                 final int oldNodeArraySize = ptNodeArray.mCachedSize;
502                 final boolean changed =
503                         computeActualPtNodeArraySize(ptNodeArray, dict, formatOptions);
504                 final int newNodeArraySize = ptNodeArray.mCachedSize;
505                 if (oldNodeArraySize < newNodeArraySize) {
506                     throw new RuntimeException("Increased size ?!");
507                 }
508                 ptNodeArrayStartOffset += newNodeArraySize;
509                 changesDone |= changed;
510             }
511             updatePtNodeArraysCachedAddresses(flatNodes);
512             ++passes;
513             if (passes > MAX_PASSES) throw new RuntimeException("Too many passes - probably a bug");
514         } while (changesDone);
515 
516         if (formatOptions.mSupportsDynamicUpdate) {
517             computeParentAddresses(flatNodes);
518         }
519         final PtNodeArray lastPtNodeArray = flatNodes.get(flatNodes.size() - 1);
520         MakedictLog.i("Compression complete in " + passes + " passes.");
521         MakedictLog.i("After address compression : "
522                 + (lastPtNodeArray.mCachedAddressAfterUpdate + lastPtNodeArray.mCachedSize));
523 
524         return flatNodes;
525     }
526 
527     /**
528      * Sanity-checking method.
529      *
530      * This method checks a list of PtNode arrays for juxtaposition, that is, it will do
531      * nothing if each node array's cached address is actually the previous node array's address
532      * plus the previous node's size.
533      * If this is not the case, it will throw an exception.
534      *
535      * @param arrays the list of node arrays to check
536      */
checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays)537     /* package */ static void checkFlatPtNodeArrayList(final ArrayList<PtNodeArray> arrays) {
538         int offset = 0;
539         int index = 0;
540         for (final PtNodeArray ptNodeArray : arrays) {
541             // BeforeUpdate and AfterUpdate addresses are the same here, so it does not matter
542             // which we use.
543             if (ptNodeArray.mCachedAddressAfterUpdate != offset) {
544                 throw new RuntimeException("Wrong address for node " + index
545                         + " : expected " + offset + ", got " +
546                         ptNodeArray.mCachedAddressAfterUpdate);
547             }
548             ++index;
549             offset += ptNodeArray.mCachedSize;
550         }
551     }
552 
553     /**
554      * Helper method to write a children position to a file.
555      *
556      * @param buffer the buffer to write to.
557      * @param index the index in the buffer to write the address to.
558      * @param position the position to write.
559      * @return the size in bytes the address actually took.
560      */
writeChildrenPosition(final byte[] buffer, int index, final int position)561     /* package */ static int writeChildrenPosition(final byte[] buffer, int index,
562             final int position) {
563         switch (getByteSize(position)) {
564         case 1:
565             buffer[index++] = (byte)position;
566             return 1;
567         case 2:
568             buffer[index++] = (byte)(0xFF & (position >> 8));
569             buffer[index++] = (byte)(0xFF & position);
570             return 2;
571         case 3:
572             buffer[index++] = (byte)(0xFF & (position >> 16));
573             buffer[index++] = (byte)(0xFF & (position >> 8));
574             buffer[index++] = (byte)(0xFF & position);
575             return 3;
576         case 0:
577             return 0;
578         default:
579             throw new RuntimeException("Position " + position + " has a strange size");
580         }
581     }
582 
583     /**
584      * Helper method to write a signed children position to a file.
585      *
586      * @param buffer the buffer to write to.
587      * @param index the index in the buffer to write the address to.
588      * @param position the position to write.
589      * @return the size in bytes the address actually took.
590      */
writeSignedChildrenPosition(final byte[] buffer, int index, final int position)591     /* package */ static int writeSignedChildrenPosition(final byte[] buffer, int index,
592             final int position) {
593         if (!BinaryDictIOUtils.hasChildrenAddress(position)) {
594             buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
595         } else {
596             final int absPosition = Math.abs(position);
597             buffer[index++] =
598                     (byte)((position < 0 ? FormatSpec.MSB8 : 0) | (0xFF & (absPosition >> 16)));
599             buffer[index++] = (byte)(0xFF & (absPosition >> 8));
600             buffer[index++] = (byte)(0xFF & absPosition);
601         }
602         return 3;
603     }
604 
605     /**
606      * Makes the flag value for a PtNode.
607      *
608      * @param hasMultipleChars whether the PtNode has multiple chars.
609      * @param isTerminal whether the PtNode is terminal.
610      * @param childrenAddressSize the size of a children address.
611      * @param hasShortcuts whether the PtNode has shortcuts.
612      * @param hasBigrams whether the PtNode has bigrams.
613      * @param isNotAWord whether the PtNode is not a word.
614      * @param isBlackListEntry whether the PtNode is a blacklist entry.
615      * @param formatOptions file format options.
616      * @return the flags
617      */
makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal, final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams, final boolean isNotAWord, final boolean isBlackListEntry, final FormatOptions formatOptions)618     static int makePtNodeFlags(final boolean hasMultipleChars, final boolean isTerminal,
619             final int childrenAddressSize, final boolean hasShortcuts, final boolean hasBigrams,
620             final boolean isNotAWord, final boolean isBlackListEntry,
621             final FormatOptions formatOptions) {
622         byte flags = 0;
623         if (hasMultipleChars) flags |= FormatSpec.FLAG_HAS_MULTIPLE_CHARS;
624         if (isTerminal) flags |= FormatSpec.FLAG_IS_TERMINAL;
625         if (formatOptions.mSupportsDynamicUpdate) {
626             flags |= FormatSpec.FLAG_IS_NOT_MOVED;
627         } else if (true) {
628             switch (childrenAddressSize) {
629                 case 1:
630                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_ONEBYTE;
631                     break;
632                 case 2:
633                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_TWOBYTES;
634                     break;
635                 case 3:
636                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_THREEBYTES;
637                     break;
638                 case 0:
639                     flags |= FormatSpec.FLAG_CHILDREN_ADDRESS_TYPE_NOADDRESS;
640                     break;
641                 default:
642                     throw new RuntimeException("Node with a strange address");
643             }
644         }
645         if (hasShortcuts) flags |= FormatSpec.FLAG_HAS_SHORTCUT_TARGETS;
646         if (hasBigrams) flags |= FormatSpec.FLAG_HAS_BIGRAMS;
647         if (isNotAWord) flags |= FormatSpec.FLAG_IS_NOT_A_WORD;
648         if (isBlackListEntry) flags |= FormatSpec.FLAG_IS_BLACKLISTED;
649         return flags;
650     }
651 
makePtNodeFlags(final PtNode node, final int childrenOffset, final FormatOptions formatOptions)652     /* package */ static byte makePtNodeFlags(final PtNode node, final int childrenOffset,
653             final FormatOptions formatOptions) {
654         return (byte) makePtNodeFlags(node.mChars.length > 1, node.mFrequency >= 0,
655                 getByteSize(childrenOffset),
656                 node.mShortcutTargets != null && !node.mShortcutTargets.isEmpty(),
657                 node.mBigrams != null, node.mIsNotAWord, node.mIsBlacklistEntry, formatOptions);
658     }
659 
660     /**
661      * Makes the flag value for a bigram.
662      *
663      * @param more whether there are more bigrams after this one.
664      * @param offset the offset of the bigram.
665      * @param bigramFrequency the frequency of the bigram, 0..255.
666      * @param unigramFrequency the unigram frequency of the same word, 0..255.
667      * @param word the second bigram, for debugging purposes
668      * @return the flags
669      */
makeBigramFlags(final boolean more, final int offset, int bigramFrequency, final int unigramFrequency, final String word)670     /* package */ static final int makeBigramFlags(final boolean more, final int offset,
671             int bigramFrequency, final int unigramFrequency, final String word) {
672         int bigramFlags = (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
673                 + (offset < 0 ? FormatSpec.FLAG_BIGRAM_ATTR_OFFSET_NEGATIVE : 0);
674         switch (getByteSize(offset)) {
675         case 1:
676             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_ONEBYTE;
677             break;
678         case 2:
679             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_TWOBYTES;
680             break;
681         case 3:
682             bigramFlags |= FormatSpec.FLAG_BIGRAM_ATTR_ADDRESS_TYPE_THREEBYTES;
683             break;
684         default:
685             throw new RuntimeException("Strange offset size");
686         }
687         if (unigramFrequency > bigramFrequency) {
688             MakedictLog.e("Unigram freq is superior to bigram freq for \"" + word
689                     + "\". Bigram freq is " + bigramFrequency + ", unigram freq for "
690                     + word + " is " + unigramFrequency);
691             bigramFrequency = unigramFrequency;
692         }
693         // We compute the difference between 255 (which means probability = 1) and the
694         // unigram score. We split this into a number of discrete steps.
695         // Now, the steps are numbered 0~15; 0 represents an increase of 1 step while 15
696         // represents an increase of 16 steps: a value of 15 will be interpreted as the median
697         // value of the 16th step. In all justice, if the bigram frequency is low enough to be
698         // rounded below the first step (which means it is less than half a step higher than the
699         // unigram frequency) then the unigram frequency itself is the best approximation of the
700         // bigram freq that we could possibly supply, hence we should *not* include this bigram
701         // in the file at all.
702         // until this is done, we'll write 0 and slightly overestimate this case.
703         // In other words, 0 means "between 0.5 step and 1.5 step", 1 means "between 1.5 step
704         // and 2.5 steps", and 15 means "between 15.5 steps and 16.5 steps". So we want to
705         // divide our range [unigramFreq..MAX_TERMINAL_FREQUENCY] in 16.5 steps to get the
706         // step size. Then we compute the start of the first step (the one where value 0 starts)
707         // by adding half-a-step to the unigramFrequency. From there, we compute the integer
708         // number of steps to the bigramFrequency. One last thing: we want our steps to include
709         // their lower bound and exclude their higher bound so we need to have the first step
710         // start at exactly 1 unit higher than floor(unigramFreq + half a step).
711         // Note : to reconstruct the score, the dictionary reader will need to divide
712         // MAX_TERMINAL_FREQUENCY - unigramFreq by 16.5 likewise to get the value of the step,
713         // and add (discretizedFrequency + 0.5 + 0.5) times this value to get the best
714         // approximation. (0.5 to get the first step start, and 0.5 to get the middle of the
715         // step pointed by the discretized frequency.
716         final float stepSize =
717                 (FormatSpec.MAX_TERMINAL_FREQUENCY - unigramFrequency)
718                 / (1.5f + FormatSpec.MAX_BIGRAM_FREQUENCY);
719         final float firstStepStart = 1 + unigramFrequency + (stepSize / 2.0f);
720         final int discretizedFrequency = (int)((bigramFrequency - firstStepStart) / stepSize);
721         // If the bigram freq is less than half-a-step higher than the unigram freq, we get -1
722         // here. The best approximation would be the unigram freq itself, so we should not
723         // include this bigram in the dictionary. For now, register as 0, and live with the
724         // small over-estimation that we get in this case. TODO: actually remove this bigram
725         // if discretizedFrequency < 0.
726         final int finalBigramFrequency = discretizedFrequency > 0 ? discretizedFrequency : 0;
727         bigramFlags += finalBigramFrequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY;
728         return bigramFlags;
729     }
730 
731     /**
732      * Makes the 2-byte value for options flags.
733      */
makeOptionsValue(final FusionDictionary dictionary, final FormatOptions formatOptions)734     private static final int makeOptionsValue(final FusionDictionary dictionary,
735             final FormatOptions formatOptions) {
736         final DictionaryOptions options = dictionary.mOptions;
737         final boolean hasBigrams = dictionary.hasBigrams();
738         return (options.mFrenchLigatureProcessing ? FormatSpec.FRENCH_LIGATURE_PROCESSING_FLAG : 0)
739                 + (options.mGermanUmlautProcessing ? FormatSpec.GERMAN_UMLAUT_PROCESSING_FLAG : 0)
740                 + (hasBigrams ? FormatSpec.CONTAINS_BIGRAMS_FLAG : 0)
741                 + (formatOptions.mSupportsDynamicUpdate ? FormatSpec.SUPPORTS_DYNAMIC_UPDATE : 0);
742     }
743 
744     /**
745      * Makes the flag value for a shortcut.
746      *
747      * @param more whether there are more attributes after this one.
748      * @param frequency the frequency of the attribute, 0..15
749      * @return the flags
750      */
makeShortcutFlags(final boolean more, final int frequency)751     static final int makeShortcutFlags(final boolean more, final int frequency) {
752         return (more ? FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_HAS_NEXT : 0)
753                 + (frequency & FormatSpec.FLAG_BIGRAM_SHORTCUT_ATTR_FREQUENCY);
754     }
755 
writeParentAddress(final byte[] buffer, final int index, final int address, final FormatOptions formatOptions)756     /* package */ static final int writeParentAddress(final byte[] buffer, final int index,
757             final int address, final FormatOptions formatOptions) {
758         if (BinaryDictIOUtils.supportsDynamicUpdate(formatOptions)) {
759             if (address == FormatSpec.NO_PARENT_ADDRESS) {
760                 buffer[index] = buffer[index + 1] = buffer[index + 2] = 0;
761             } else {
762                 final int absAddress = Math.abs(address);
763                 assert(absAddress <= FormatSpec.SINT24_MAX);
764                 buffer[index] = (byte)((address < 0 ? FormatSpec.MSB8 : 0)
765                         | ((absAddress >> 16) & 0xFF));
766                 buffer[index + 1] = (byte)((absAddress >> 8) & 0xFF);
767                 buffer[index + 2] = (byte)(absAddress & 0xFF);
768             }
769             return index + 3;
770         } else {
771             return index;
772         }
773     }
774 
getChildrenPosition(final PtNode ptNode, final FormatOptions formatOptions)775     /* package */ static final int getChildrenPosition(final PtNode ptNode,
776             final FormatOptions formatOptions) {
777         int positionOfChildrenPosField = ptNode.mCachedAddressAfterUpdate
778                 + getNodeHeaderSize(ptNode, formatOptions);
779         if (ptNode.isTerminal()) {
780             // A terminal node has either the terminal id or the frequency.
781             // If positionOfChildrenPosField is incorrect, we may crash when jumping to the children
782             // position.
783             if (formatOptions.mHasTerminalId) {
784                 positionOfChildrenPosField += FormatSpec.PTNODE_TERMINAL_ID_SIZE;
785             } else {
786                 positionOfChildrenPosField += FormatSpec.PTNODE_FREQUENCY_SIZE;
787             }
788         }
789         return null == ptNode.mChildren ? FormatSpec.NO_CHILDREN_ADDRESS
790                 : ptNode.mChildren.mCachedAddressAfterUpdate - positionOfChildrenPosField;
791     }
792 
793     /**
794      * Write a PtNodeArray. The PtNodeArray is expected to have its final position cached.
795      *
796      * @param dict the dictionary the node array is a part of (for relative offsets).
797      * @param dictEncoder the dictionary encoder.
798      * @param ptNodeArray the node array to write.
799      * @param formatOptions file format options.
800      */
801     @SuppressWarnings("unused")
writePlacedPtNodeArray(final FusionDictionary dict, final DictEncoder dictEncoder, final PtNodeArray ptNodeArray, final FormatOptions formatOptions)802     /* package */ static void writePlacedPtNodeArray(final FusionDictionary dict,
803             final DictEncoder dictEncoder, final PtNodeArray ptNodeArray,
804             final FormatOptions formatOptions) {
805         // TODO: Make the code in common with BinaryDictIOUtils#writePtNode
806         dictEncoder.setPosition(ptNodeArray.mCachedAddressAfterUpdate);
807 
808         final int ptNodeCount = ptNodeArray.mData.size();
809         dictEncoder.writePtNodeCount(ptNodeCount);
810         final int parentPosition =
811                 (ptNodeArray.mCachedParentAddress == FormatSpec.NO_PARENT_ADDRESS)
812                 ? FormatSpec.NO_PARENT_ADDRESS
813                 : ptNodeArray.mCachedParentAddress + ptNodeArray.mCachedAddressAfterUpdate;
814         for (int i = 0; i < ptNodeCount; ++i) {
815             final PtNode ptNode = ptNodeArray.mData.get(i);
816             if (dictEncoder.getPosition() != ptNode.mCachedAddressAfterUpdate) {
817                 throw new RuntimeException("Bug: write index is not the same as the cached address "
818                         + "of the node : " + dictEncoder.getPosition() + " <> "
819                         + ptNode.mCachedAddressAfterUpdate);
820             }
821             // Sanity checks.
822             if (DBG && ptNode.mFrequency > FormatSpec.MAX_TERMINAL_FREQUENCY) {
823                 throw new RuntimeException("A node has a frequency > "
824                         + FormatSpec.MAX_TERMINAL_FREQUENCY
825                         + " : " + ptNode.mFrequency);
826             }
827             dictEncoder.writePtNode(ptNode, parentPosition, formatOptions, dict);
828         }
829         if (formatOptions.mSupportsDynamicUpdate) {
830             dictEncoder.writeForwardLinkAddress(FormatSpec.NO_FORWARD_LINK_ADDRESS);
831         }
832         if (dictEncoder.getPosition() != ptNodeArray.mCachedAddressAfterUpdate
833                 + ptNodeArray.mCachedSize) {
834             throw new RuntimeException("Not the same size : written "
835                      + (dictEncoder.getPosition() - ptNodeArray.mCachedAddressAfterUpdate)
836                      + " bytes from a node that should have " + ptNodeArray.mCachedSize + " bytes");
837         }
838     }
839 
840     /**
841      * Dumps a collection of useful statistics about a list of PtNode arrays.
842      *
843      * This prints purely informative stuff, like the total estimated file size, the
844      * number of PtNode arrays, of PtNodes, the repartition of each address size, etc
845      *
846      * @param ptNodeArrays the list of PtNode arrays.
847      */
showStatistics(ArrayList<PtNodeArray> ptNodeArrays)848     /* package */ static void showStatistics(ArrayList<PtNodeArray> ptNodeArrays) {
849         int firstTerminalAddress = Integer.MAX_VALUE;
850         int lastTerminalAddress = Integer.MIN_VALUE;
851         int size = 0;
852         int ptNodes = 0;
853         int maxNodes = 0;
854         int maxRuns = 0;
855         for (final PtNodeArray ptNodeArray : ptNodeArrays) {
856             if (maxNodes < ptNodeArray.mData.size()) maxNodes = ptNodeArray.mData.size();
857             for (final PtNode ptNode : ptNodeArray.mData) {
858                 ++ptNodes;
859                 if (ptNode.mChars.length > maxRuns) maxRuns = ptNode.mChars.length;
860                 if (ptNode.mFrequency >= 0) {
861                     if (ptNodeArray.mCachedAddressAfterUpdate < firstTerminalAddress)
862                         firstTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
863                     if (ptNodeArray.mCachedAddressAfterUpdate > lastTerminalAddress)
864                         lastTerminalAddress = ptNodeArray.mCachedAddressAfterUpdate;
865                 }
866             }
867             if (ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize > size) {
868                 size = ptNodeArray.mCachedAddressAfterUpdate + ptNodeArray.mCachedSize;
869             }
870         }
871         final int[] ptNodeCounts = new int[maxNodes + 1];
872         final int[] runCounts = new int[maxRuns + 1];
873         for (final PtNodeArray ptNodeArray : ptNodeArrays) {
874             ++ptNodeCounts[ptNodeArray.mData.size()];
875             for (final PtNode ptNode : ptNodeArray.mData) {
876                 ++runCounts[ptNode.mChars.length];
877             }
878         }
879 
880         MakedictLog.i("Statistics:\n"
881                 + "  total file size " + size + "\n"
882                 + "  " + ptNodeArrays.size() + " node arrays\n"
883                 + "  " + ptNodes + " PtNodes (" + ((float)ptNodes / ptNodeArrays.size())
884                         + " PtNodes per node)\n"
885                 + "  first terminal at " + firstTerminalAddress + "\n"
886                 + "  last terminal at " + lastTerminalAddress + "\n"
887                 + "  PtNode stats : max = " + maxNodes);
888         for (int i = 0; i < ptNodeCounts.length; ++i) {
889             MakedictLog.i("    " + i + " : " + ptNodeCounts[i]);
890         }
891         MakedictLog.i("  Character run stats : max = " + maxRuns);
892         for (int i = 0; i < runCounts.length; ++i) {
893             MakedictLog.i("    " + i + " : " + runCounts[i]);
894         }
895     }
896 
897     /**
898      * Writes a file header to an output stream.
899      *
900      * @param destination the stream to write the file header to.
901      * @param dict the dictionary to write.
902      * @param formatOptions file format options.
903      * @return the size of the header.
904      */
writeDictionaryHeader(final OutputStream destination, final FusionDictionary dict, final FormatOptions formatOptions)905     /* package */ static int writeDictionaryHeader(final OutputStream destination,
906             final FusionDictionary dict, final FormatOptions formatOptions)
907                     throws IOException, UnsupportedFormatException {
908         final int version = formatOptions.mVersion;
909         if (version < FormatSpec.MINIMUM_SUPPORTED_VERSION
910                 || version > FormatSpec.MAXIMUM_SUPPORTED_VERSION) {
911             throw new UnsupportedFormatException("Requested file format version " + version
912                     + ", but this implementation only supports versions "
913                     + FormatSpec.MINIMUM_SUPPORTED_VERSION + " through "
914                     + FormatSpec.MAXIMUM_SUPPORTED_VERSION);
915         }
916 
917         ByteArrayOutputStream headerBuffer = new ByteArrayOutputStream(256);
918 
919         // The magic number in big-endian order.
920         // Magic number for all versions.
921         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 24)));
922         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 16)));
923         headerBuffer.write((byte) (0xFF & (FormatSpec.MAGIC_NUMBER >> 8)));
924         headerBuffer.write((byte) (0xFF & FormatSpec.MAGIC_NUMBER));
925         // Dictionary version.
926         headerBuffer.write((byte) (0xFF & (version >> 8)));
927         headerBuffer.write((byte) (0xFF & version));
928 
929         // Options flags
930         final int options = makeOptionsValue(dict, formatOptions);
931         headerBuffer.write((byte) (0xFF & (options >> 8)));
932         headerBuffer.write((byte) (0xFF & options));
933         final int headerSizeOffset = headerBuffer.size();
934         // Placeholder to be written later with header size.
935         for (int i = 0; i < 4; ++i) {
936             headerBuffer.write(0);
937         }
938         // Write out the options.
939         for (final String key : dict.mOptions.mAttributes.keySet()) {
940             final String value = dict.mOptions.mAttributes.get(key);
941             CharEncoding.writeString(headerBuffer, key);
942             CharEncoding.writeString(headerBuffer, value);
943         }
944         final int size = headerBuffer.size();
945         final byte[] bytes = headerBuffer.toByteArray();
946         // Write out the header size.
947         bytes[headerSizeOffset] = (byte) (0xFF & (size >> 24));
948         bytes[headerSizeOffset + 1] = (byte) (0xFF & (size >> 16));
949         bytes[headerSizeOffset + 2] = (byte) (0xFF & (size >> 8));
950         bytes[headerSizeOffset + 3] = (byte) (0xFF & (size >> 0));
951         destination.write(bytes);
952 
953         headerBuffer.close();
954         return size;
955     }
956 }
957