• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collections;
22 import java.util.HashMap;
23 import java.util.Iterator;
24 import java.util.LinkedList;
25 
26 /**
27  * A dictionary that can fusion heads and tails of words for more compression.
28  */
29 public class FusionDictionary implements Iterable<Word> {
30 
31     private static final boolean DBG = MakedictLog.DBG;
32 
33     /**
34      * A node of the dictionary, containing several CharGroups.
35      *
36      * A node is but an ordered array of CharGroups, which essentially contain all the
37      * real information.
38      * This class also contains fields to cache size and address, to help with binary
39      * generation.
40      */
41     public static class Node {
42         ArrayList<CharGroup> mData;
43         // To help with binary generation
44         int mCachedSize;
45         int mCachedAddress;
Node()46         public Node() {
47             mData = new ArrayList<CharGroup>();
48             mCachedSize = Integer.MIN_VALUE;
49             mCachedAddress = Integer.MIN_VALUE;
50         }
Node(ArrayList<CharGroup> data)51         public Node(ArrayList<CharGroup> data) {
52             mData = data;
53             mCachedSize = Integer.MIN_VALUE;
54             mCachedAddress = Integer.MIN_VALUE;
55         }
56     }
57 
58     /**
59      * A string with a frequency.
60      *
61      * This represents an "attribute", that is either a bigram or a shortcut.
62      */
63     public static class WeightedString {
64         final String mWord;
65         int mFrequency;
WeightedString(String word, int frequency)66         public WeightedString(String word, int frequency) {
67             mWord = word;
68             mFrequency = frequency;
69         }
70 
71         @Override
hashCode()72         public int hashCode() {
73             return Arrays.hashCode(new Object[] { mWord, mFrequency });
74         }
75 
76         @Override
equals(Object o)77         public boolean equals(Object o) {
78             if (o == this) return true;
79             if (!(o instanceof WeightedString)) return false;
80             WeightedString w = (WeightedString)o;
81             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
82         }
83     }
84 
85     /**
86      * A group of characters, with a frequency, shortcut targets, bigrams, and children.
87      *
88      * This is the central class of the in-memory representation. A CharGroup is what can
89      * be seen as a traditional "trie node", except it can hold several characters at the
90      * same time. A CharGroup essentially represents one or several characters in the middle
91      * of the trie trie; as such, it can be a terminal, and it can have children.
92      * In this in-memory representation, whether the CharGroup is a terminal or not is represented
93      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
94      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
95      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
96      */
97     public static class CharGroup {
98         public static final int NOT_A_TERMINAL = -1;
99         final int mChars[];
100         ArrayList<WeightedString> mShortcutTargets;
101         ArrayList<WeightedString> mBigrams;
102         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
103         Node mChildren;
104         // The two following members to help with binary generation
105         int mCachedSize;
106         int mCachedAddress;
107 
CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency)108         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
109                 final ArrayList<WeightedString> bigrams, final int frequency) {
110             mChars = chars;
111             mFrequency = frequency;
112             mShortcutTargets = shortcutTargets;
113             mBigrams = bigrams;
114             mChildren = null;
115         }
116 
CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final Node children)117         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
118                 final ArrayList<WeightedString> bigrams, final int frequency, final Node children) {
119             mChars = chars;
120             mFrequency = frequency;
121             mShortcutTargets = shortcutTargets;
122             mBigrams = bigrams;
123             mChildren = children;
124         }
125 
addChild(CharGroup n)126         public void addChild(CharGroup n) {
127             if (null == mChildren) {
128                 mChildren = new Node();
129             }
130             mChildren.mData.add(n);
131         }
132 
isTerminal()133         public boolean isTerminal() {
134             return NOT_A_TERMINAL != mFrequency;
135         }
136 
hasSeveralChars()137         public boolean hasSeveralChars() {
138             assert(mChars.length > 0);
139             return 1 < mChars.length;
140         }
141 
142         /**
143          * Adds a word to the bigram list. Updates the frequency if the word already
144          * exists.
145          */
addBigram(final String word, final int frequency)146         public void addBigram(final String word, final int frequency) {
147             if (mBigrams == null) {
148                 mBigrams = new ArrayList<WeightedString>();
149             }
150             WeightedString bigram = getBigram(word);
151             if (bigram != null) {
152                 bigram.mFrequency = frequency;
153             } else {
154                 bigram = new WeightedString(word, frequency);
155                 mBigrams.add(bigram);
156             }
157         }
158 
159         /**
160          * Gets the shortcut target for the given word. Returns null if the word is not in the
161          * shortcut list.
162          */
getShortcut(final String word)163         public WeightedString getShortcut(final String word) {
164             // TODO: Don't do a linear search
165             if (mShortcutTargets != null) {
166                 final int size = mShortcutTargets.size();
167                 for (int i = 0; i < size; ++i) {
168                     WeightedString shortcut = mShortcutTargets.get(i);
169                     if (shortcut.mWord.equals(word)) {
170                         return shortcut;
171                     }
172                 }
173             }
174             return null;
175         }
176 
177         /**
178          * Gets the bigram for the given word.
179          * Returns null if the word is not in the bigrams list.
180          */
getBigram(final String word)181         public WeightedString getBigram(final String word) {
182             // TODO: Don't do a linear search
183             if (mBigrams != null) {
184                 final int size = mBigrams.size();
185                 for (int i = 0; i < size; ++i) {
186                     WeightedString bigram = mBigrams.get(i);
187                     if (bigram.mWord.equals(word)) {
188                         return bigram;
189                     }
190                 }
191             }
192             return null;
193         }
194 
195         /**
196          * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to
197          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
198          * updated if they are higher than the existing ones.
199          */
update(int frequency, ArrayList<WeightedString> shortcutTargets, ArrayList<WeightedString> bigrams)200         public void update(int frequency, ArrayList<WeightedString> shortcutTargets,
201                 ArrayList<WeightedString> bigrams) {
202             if (frequency > mFrequency) {
203                 mFrequency = frequency;
204             }
205             if (shortcutTargets != null) {
206                 if (mShortcutTargets == null) {
207                     mShortcutTargets = shortcutTargets;
208                 } else {
209                     final int size = shortcutTargets.size();
210                     for (int i = 0; i < size; ++i) {
211                         final WeightedString shortcut = shortcutTargets.get(i);
212                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
213                         if (existingShortcut == null) {
214                             mShortcutTargets.add(shortcut);
215                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
216                             existingShortcut.mFrequency = shortcut.mFrequency;
217                         }
218                     }
219                 }
220             }
221             if (bigrams != null) {
222                 if (mBigrams == null) {
223                     mBigrams = bigrams;
224                 } else {
225                     final int size = bigrams.size();
226                     for (int i = 0; i < size; ++i) {
227                         final WeightedString bigram = bigrams.get(i);
228                         final WeightedString existingBigram = getBigram(bigram.mWord);
229                         if (existingBigram == null) {
230                             mBigrams.add(bigram);
231                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
232                             existingBigram.mFrequency = bigram.mFrequency;
233                         }
234                     }
235                 }
236             }
237         }
238     }
239 
240     /**
241      * Options global to the dictionary.
242      *
243      * There are no options at the moment, so this class is empty.
244      */
245     public static class DictionaryOptions {
246         public final boolean mGermanUmlautProcessing;
247         public final boolean mFrenchLigatureProcessing;
248         public final HashMap<String, String> mAttributes;
DictionaryOptions(final HashMap<String, String> attributes, final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing)249         public DictionaryOptions(final HashMap<String, String> attributes,
250                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
251             mAttributes = attributes;
252             mGermanUmlautProcessing = germanUmlautProcessing;
253             mFrenchLigatureProcessing = frenchLigatureProcessing;
254         }
255     }
256 
257     public final DictionaryOptions mOptions;
258     public final Node mRoot;
259 
FusionDictionary(final Node root, final DictionaryOptions options)260     public FusionDictionary(final Node root, final DictionaryOptions options) {
261         mRoot = root;
262         mOptions = options;
263     }
264 
addOptionAttribute(final String key, final String value)265     public void addOptionAttribute(final String key, final String value) {
266         mOptions.mAttributes.put(key, value);
267     }
268 
269     /**
270      * Helper method to convert a String to an int array.
271      */
getCodePoints(final String word)272     static private int[] getCodePoints(final String word) {
273         // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
274         // which is not visible from the makedict package. Factor this code.
275         final char[] characters = word.toCharArray();
276         final int length = characters.length;
277         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
278         int codePoint = Character.codePointAt(characters, 0);
279         int dsti = 0;
280         for (int srci = Character.charCount(codePoint);
281                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
282             codePoints[dsti] = codePoint;
283             codePoint = Character.codePointAt(characters, srci);
284         }
285         codePoints[dsti] = codePoint;
286         return codePoints;
287     }
288 
289     /**
290      * Helper method to add a word as a string.
291      *
292      * This method adds a word to the dictionary with the given frequency. Optional
293      * lists of bigrams and shortcuts can be passed here. For each word inside,
294      * they will be added to the dictionary as necessary.
295      *
296      * @param word the word to add.
297      * @param frequency the frequency of the word, in the range [0..255].
298      * @param shortcutTargets a list of shortcut targets for this word, or null.
299      */
add(final String word, final int frequency, final ArrayList<WeightedString> shortcutTargets)300     public void add(final String word, final int frequency,
301             final ArrayList<WeightedString> shortcutTargets) {
302         add(getCodePoints(word), frequency, shortcutTargets);
303     }
304 
305     /**
306      * Sanity check for a node.
307      *
308      * This method checks that all CharGroups in a node are ordered as expected.
309      * If they are, nothing happens. If they aren't, an exception is thrown.
310      */
checkStack(Node node)311     private void checkStack(Node node) {
312         ArrayList<CharGroup> stack = node.mData;
313         int lastValue = -1;
314         for (int i = 0; i < stack.size(); ++i) {
315             int currentValue = stack.get(i).mChars[0];
316             if (currentValue <= lastValue)
317                 throw new RuntimeException("Invalid stack");
318             else
319                 lastValue = currentValue;
320         }
321     }
322 
323     /**
324      * Helper method to add a new bigram to the dictionary.
325      *
326      * @param word1 the previous word of the context
327      * @param word2 the next word of the context
328      * @param frequency the bigram frequency
329      */
setBigram(final String word1, final String word2, final int frequency)330     public void setBigram(final String word1, final String word2, final int frequency) {
331         CharGroup charGroup = findWordInTree(mRoot, word1);
332         if (charGroup != null) {
333             final CharGroup charGroup2 = findWordInTree(mRoot, word2);
334             if (charGroup2 == null) {
335                 add(getCodePoints(word2), 0, null);
336             }
337             charGroup.addBigram(word2, frequency);
338         } else {
339             throw new RuntimeException("First word of bigram not found");
340         }
341     }
342 
343     /**
344      * Add a word to this dictionary.
345      *
346      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
347      * an exception is thrown.
348      *
349      * @param word the word, as an int array.
350      * @param frequency the frequency of the word, in the range [0..255].
351      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
352      */
add(final int[] word, final int frequency, final ArrayList<WeightedString> shortcutTargets)353     private void add(final int[] word, final int frequency,
354             final ArrayList<WeightedString> shortcutTargets) {
355         assert(frequency >= 0 && frequency <= 255);
356         Node currentNode = mRoot;
357         int charIndex = 0;
358 
359         CharGroup currentGroup = null;
360         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
361         int nodeIndex = findIndexOfChar(mRoot, word[charIndex]);
362         while (CHARACTER_NOT_FOUND != nodeIndex) {
363             currentGroup = currentNode.mData.get(nodeIndex);
364             differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex);
365             if (ARRAYS_ARE_EQUAL != differentCharIndex
366                     && differentCharIndex < currentGroup.mChars.length) break;
367             if (null == currentGroup.mChildren) break;
368             charIndex += currentGroup.mChars.length;
369             if (charIndex >= word.length) break;
370             currentNode = currentGroup.mChildren;
371             nodeIndex = findIndexOfChar(currentNode, word[charIndex]);
372         }
373 
374         if (-1 == nodeIndex) {
375             // No node at this point to accept the word. Create one.
376             final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
377             final CharGroup newGroup = new CharGroup(
378                     Arrays.copyOfRange(word, charIndex, word.length),
379                     shortcutTargets, null /* bigrams */, frequency);
380             currentNode.mData.add(insertionIndex, newGroup);
381             if (DBG) checkStack(currentNode);
382         } else {
383             // There is a word with a common prefix.
384             if (differentCharIndex == currentGroup.mChars.length) {
385                 if (charIndex + differentCharIndex >= word.length) {
386                     // The new word is a prefix of an existing word, but the node on which it
387                     // should end already exists as is. Since the old CharNode was not a terminal,
388                     // make it one by filling in its frequency and other attributes
389                     currentGroup.update(frequency, shortcutTargets, null);
390                 } else {
391                     // The new word matches the full old word and extends past it.
392                     // We only have to create a new node and add it to the end of this.
393                     final CharGroup newNode = new CharGroup(
394                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
395                                     shortcutTargets, null /* bigrams */, frequency);
396                     currentGroup.mChildren = new Node();
397                     currentGroup.mChildren.mData.add(newNode);
398                 }
399             } else {
400                 if (0 == differentCharIndex) {
401                     // Exact same word. Update the frequency if higher. This will also add the
402                     // new shortcuts to the existing shortcut list if it already exists.
403                     currentGroup.update(frequency, shortcutTargets, null);
404                 } else {
405                     // Partial prefix match only. We have to replace the current node with a node
406                     // containing the current prefix and create two new ones for the tails.
407                     Node newChildren = new Node();
408                     final CharGroup newOldWord = new CharGroup(
409                             Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
410                                     currentGroup.mChars.length), currentGroup.mShortcutTargets,
411                             currentGroup.mBigrams, currentGroup.mFrequency, currentGroup.mChildren);
412                     newChildren.mData.add(newOldWord);
413 
414                     final CharGroup newParent;
415                     if (charIndex + differentCharIndex >= word.length) {
416                         newParent = new CharGroup(
417                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
418                                 shortcutTargets, null /* bigrams */, frequency, newChildren);
419                     } else {
420                         newParent = new CharGroup(
421                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
422                                 null /* shortcutTargets */, null /* bigrams */, -1, newChildren);
423                         final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
424                                 charIndex + differentCharIndex, word.length),
425                                 shortcutTargets, null /* bigrams */, frequency);
426                         final int addIndex = word[charIndex + differentCharIndex]
427                                 > currentGroup.mChars[differentCharIndex] ? 1 : 0;
428                         newChildren.mData.add(addIndex, newWord);
429                     }
430                     currentNode.mData.set(nodeIndex, newParent);
431                 }
432                 if (DBG) checkStack(currentNode);
433             }
434         }
435     }
436 
437     private static int ARRAYS_ARE_EQUAL = 0;
438 
439     /**
440      * Custom comparison of two int arrays taken to contain character codes.
441      *
442      * This method compares the two arrays passed as an argument in a lexicographic way,
443      * with an offset in the dst string.
444      * This method does NOT test for the first character. It is taken to be equal.
445      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
446      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
447      * strings are equal. This works BECAUSE we don't look at the first character.
448      *
449      * @param src the left-hand side string of the comparison.
450      * @param dst the right-hand side string of the comparison.
451      * @param dstOffset the offset in the right-hand side string.
452      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
453      */
compareArrays(final int[] src, final int[] dst, int dstOffset)454     private static int compareArrays(final int[] src, final int[] dst, int dstOffset) {
455         // We do NOT test the first char, because we come from a method that already
456         // tested it.
457         for (int i = 1; i < src.length; ++i) {
458             if (dstOffset + i >= dst.length) return i;
459             if (src[i] != dst[dstOffset + i]) return i;
460         }
461         if (dst.length > src.length) return src.length;
462         return ARRAYS_ARE_EQUAL;
463     }
464 
465     /**
466      * Helper class that compares and sorts two chargroups according to their
467      * first element only. I repeat: ONLY the first element is considered, the rest
468      * is ignored.
469      * This comparator imposes orderings that are inconsistent with equals.
470      */
471     static private class CharGroupComparator implements java.util.Comparator<CharGroup> {
472         @Override
compare(CharGroup c1, CharGroup c2)473         public int compare(CharGroup c1, CharGroup c2) {
474             if (c1.mChars[0] == c2.mChars[0]) return 0;
475             return c1.mChars[0] < c2.mChars[0] ? -1 : 1;
476         }
477     }
478     final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator();
479 
480     /**
481      * Finds the insertion index of a character within a node.
482      */
findInsertionIndex(final Node node, int character)483     private static int findInsertionIndex(final Node node, int character) {
484         final ArrayList<CharGroup> data = node.mData;
485         final CharGroup reference = new CharGroup(new int[] { character },
486                 null /* shortcutTargets */, null /* bigrams */, 0);
487         int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
488         return result >= 0 ? result : -result - 1;
489     }
490 
491     private static int CHARACTER_NOT_FOUND = -1;
492 
493     /**
494      * Find the index of a char in a node, if it exists.
495      *
496      * @param node the node to search in.
497      * @param character the character to search for.
498      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else.
499      */
findIndexOfChar(final Node node, int character)500     private static int findIndexOfChar(final Node node, int character) {
501         final int insertionIndex = findInsertionIndex(node, character);
502         if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND;
503         return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex
504                 : CHARACTER_NOT_FOUND;
505     }
506 
507     /**
508      * Helper method to find a word in a given branch.
509      */
findWordInTree(Node node, final String s)510     public static CharGroup findWordInTree(Node node, final String s) {
511         int index = 0;
512         final StringBuilder checker = DBG ? new StringBuilder() : null;
513 
514         CharGroup currentGroup;
515         do {
516             int indexOfGroup = findIndexOfChar(node, s.codePointAt(index));
517             if (CHARACTER_NOT_FOUND == indexOfGroup) return null;
518             currentGroup = node.mData.get(indexOfGroup);
519             if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
520             index += currentGroup.mChars.length;
521             if (index < s.length()) {
522                 node = currentGroup.mChildren;
523             }
524         } while (null != node && index < s.length());
525 
526         if (DBG && !s.equals(checker.toString())) return null;
527         return currentGroup;
528     }
529 
530     /**
531      * Helper method to find out whether a word is in the dict or not.
532      */
hasWord(final String s)533     public boolean hasWord(final String s) {
534         if (null == s || "".equals(s)) {
535             throw new RuntimeException("Can't search for a null or empty string");
536         }
537         return null != findWordInTree(mRoot, s);
538     }
539 
540     /**
541      * Recursively count the number of character groups in a given branch of the trie.
542      *
543      * @param node the parent node.
544      * @return the number of char groups in all the branch under this node.
545      */
countCharGroups(final Node node)546     public static int countCharGroups(final Node node) {
547         final int nodeSize = node.mData.size();
548         int size = nodeSize;
549         for (int i = nodeSize - 1; i >= 0; --i) {
550             CharGroup group = node.mData.get(i);
551             if (null != group.mChildren)
552                 size += countCharGroups(group.mChildren);
553         }
554         return size;
555     }
556 
557     /**
558      * Recursively count the number of nodes in a given branch of the trie.
559      *
560      * @param node the node to count.
561      * @return the number of nodes in this branch.
562      */
countNodes(final Node node)563     public static int countNodes(final Node node) {
564         int size = 1;
565         for (int i = node.mData.size() - 1; i >= 0; --i) {
566             CharGroup group = node.mData.get(i);
567             if (null != group.mChildren)
568                 size += countNodes(group.mChildren);
569         }
570         return size;
571     }
572 
573     // Recursively find out whether there are any bigrams.
574     // This can be pretty expensive especially if there aren't any (we return as soon
575     // as we find one, so it's much cheaper if there are bigrams)
hasBigramsInternal(final Node node)576     private static boolean hasBigramsInternal(final Node node) {
577         if (null == node) return false;
578         for (int i = node.mData.size() - 1; i >= 0; --i) {
579             CharGroup group = node.mData.get(i);
580             if (null != group.mBigrams) return true;
581             if (hasBigramsInternal(group.mChildren)) return true;
582         }
583         return false;
584     }
585 
586     /**
587      * Finds out whether there are any bigrams in this dictionary.
588      *
589      * @return true if there is any bigram, false otherwise.
590      */
591     // TODO: this is expensive especially for large dictionaries without any bigram.
592     // The up side is, this is always accurate and correct and uses no memory. We should
593     // find a more efficient way of doing this, without compromising too much on memory
594     // and ease of use.
hasBigrams()595     public boolean hasBigrams() {
596         return hasBigramsInternal(mRoot);
597     }
598 
599     // Historically, the tails of the words were going to be merged to save space.
600     // However, that would prevent the code to search for a specific address in log(n)
601     // time so this was abandoned.
602     // The code is still of interest as it does add some compression to any dictionary
603     // that has no need for attributes. Implementations that does not read attributes should be
604     // able to read a dictionary with merged tails.
605     // Also, the following code does support frequencies, as in, it will only merges
606     // tails that share the same frequency. Though it would result in the above loss of
607     // performance while searching by address, it is still technically possible to merge
608     // tails that contain attributes, but this code does not take that into account - it does
609     // not compare attributes and will merge terminals with different attributes regardless.
mergeTails()610     public void mergeTails() {
611         MakedictLog.i("Do not merge tails");
612         return;
613 
614 //        MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root));
615 //        MakedictLog.i("Number of groups : " + countCharGroups(root));
616 //
617 //        final HashMap<String, ArrayList<Node>> repository =
618 //                  new HashMap<String, ArrayList<Node>>();
619 //        mergeTailsInner(repository, root);
620 //
621 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
622 //        int size = 0;
623 //        for (ArrayList<Node> a : repository.values()) {
624 //            size += a.size();
625 //        }
626 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
627 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
628     }
629 
630     // The following methods are used by the deactivated mergeTails()
631 //   private static boolean isEqual(Node a, Node b) {
632 //       if (null == a && null == b) return true;
633 //       if (null == a || null == b) return false;
634 //       if (a.data.size() != b.data.size()) return false;
635 //       final int size = a.data.size();
636 //       for (int i = size - 1; i >= 0; --i) {
637 //           CharGroup aGroup = a.data.get(i);
638 //           CharGroup bGroup = b.data.get(i);
639 //           if (aGroup.frequency != bGroup.frequency) return false;
640 //           if (aGroup.alternates == null && bGroup.alternates != null) return false;
641 //           if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false;
642 //           if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false;
643 //           if (!isEqual(aGroup.children, bGroup.children)) return false;
644 //       }
645 //       return true;
646 //   }
647 
648 //   static private HashMap<String, ArrayList<Node>> mergeTailsInner(
649 //           final HashMap<String, ArrayList<Node>> map, final Node node) {
650 //       final ArrayList<CharGroup> branches = node.data;
651 //       final int nodeSize = branches.size();
652 //       for (int i = 0; i < nodeSize; ++i) {
653 //           CharGroup group = branches.get(i);
654 //           if (null != group.children) {
655 //               String pseudoHash = getPseudoHash(group.children);
656 //               ArrayList<Node> similarList = map.get(pseudoHash);
657 //               if (null == similarList) {
658 //                   similarList = new ArrayList<Node>();
659 //                   map.put(pseudoHash, similarList);
660 //               }
661 //               boolean merged = false;
662 //               for (Node similar : similarList) {
663 //                   if (isEqual(group.children, similar)) {
664 //                       group.children = similar;
665 //                       merged = true;
666 //                       break;
667 //                   }
668 //               }
669 //               if (!merged) {
670 //                   similarList.add(group.children);
671 //               }
672 //               mergeTailsInner(map, group.children);
673 //           }
674 //       }
675 //       return map;
676 //   }
677 
678 //  private static String getPseudoHash(final Node node) {
679 //      StringBuilder s = new StringBuilder();
680 //      for (CharGroup g : node.data) {
681 //          s.append(g.frequency);
682 //          for (int ch : g.chars){
683 //              s.append(Character.toChars(ch));
684 //          }
685 //      }
686 //      return s.toString();
687 //  }
688 
689     /**
690      * Iterator to walk through a dictionary.
691      *
692      * This is purely for convenience.
693      */
694     public static class DictionaryIterator implements Iterator<Word> {
695 
696         private static class Position {
697             public Iterator<CharGroup> pos;
698             public int length;
Position(ArrayList<CharGroup> groups)699             public Position(ArrayList<CharGroup> groups) {
700                 pos = groups.iterator();
701                 length = 0;
702             }
703         }
704         final StringBuilder mCurrentString;
705         final LinkedList<Position> mPositions;
706 
DictionaryIterator(ArrayList<CharGroup> root)707         public DictionaryIterator(ArrayList<CharGroup> root) {
708             mCurrentString = new StringBuilder();
709             mPositions = new LinkedList<Position>();
710             final Position rootPos = new Position(root);
711             mPositions.add(rootPos);
712         }
713 
714         @Override
hasNext()715         public boolean hasNext() {
716             for (Position p : mPositions) {
717                 if (p.pos.hasNext()) {
718                     return true;
719                 }
720             }
721             return false;
722         }
723 
724         @Override
next()725         public Word next() {
726             Position currentPos = mPositions.getLast();
727             mCurrentString.setLength(mCurrentString.length() - currentPos.length);
728 
729             do {
730                 if (currentPos.pos.hasNext()) {
731                     final CharGroup currentGroup = currentPos.pos.next();
732                     currentPos.length = currentGroup.mChars.length;
733                     for (int i : currentGroup.mChars)
734                         mCurrentString.append(Character.toChars(i));
735                     if (null != currentGroup.mChildren) {
736                         currentPos = new Position(currentGroup.mChildren.mData);
737                         mPositions.addLast(currentPos);
738                     }
739                     if (currentGroup.mFrequency >= 0)
740                         return new Word(mCurrentString.toString(), currentGroup.mFrequency,
741                                 currentGroup.mShortcutTargets, currentGroup.mBigrams);
742                 } else {
743                     mPositions.removeLast();
744                     currentPos = mPositions.getLast();
745                     mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length);
746                 }
747             } while(true);
748         }
749 
750         @Override
remove()751         public void remove() {
752             throw new UnsupportedOperationException("Unsupported yet");
753         }
754 
755     }
756 
757     /**
758      * Method to return an iterator.
759      *
760      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
761      * and say : for (Word w : x) {}
762      */
763     @Override
iterator()764     public Iterator<Word> iterator() {
765         return new DictionaryIterator(mRoot.mData);
766     }
767 }
768