• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
5  * use this file except in compliance with the License. You may obtain a copy of
6  * the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
13  * License for the specific language governing permissions and limitations under
14  * the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.latin.Constants;
20 
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.HashMap;
25 import java.util.Iterator;
26 import java.util.LinkedList;
27 
28 /**
29  * A dictionary that can fusion heads and tails of words for more compression.
30  */
31 public final class FusionDictionary implements Iterable<Word> {
32     private static final boolean DBG = MakedictLog.DBG;
33 
34     /**
35      * A node of the dictionary, containing several CharGroups.
36      *
37      * A node is but an ordered array of CharGroups, which essentially contain all the
38      * real information.
39      * This class also contains fields to cache size and address, to help with binary
40      * generation.
41      */
42     public static final class Node {
43         ArrayList<CharGroup> mData;
44         // To help with binary generation
45         int mCachedSize = Integer.MIN_VALUE;
46         int mCachedAddress = Integer.MIN_VALUE;
47         int mCachedParentAddress = 0;
48 
Node()49         public Node() {
50             mData = new ArrayList<CharGroup>();
51         }
Node(ArrayList<CharGroup> data)52         public Node(ArrayList<CharGroup> data) {
53             mData = data;
54         }
55     }
56 
57     /**
58      * A string with a frequency.
59      *
60      * This represents an "attribute", that is either a bigram or a shortcut.
61      */
62     public static final class WeightedString {
63         public final String mWord;
64         public int mFrequency;
WeightedString(String word, int frequency)65         public WeightedString(String word, int frequency) {
66             mWord = word;
67             mFrequency = frequency;
68         }
69 
70         @Override
hashCode()71         public int hashCode() {
72             return Arrays.hashCode(new Object[] { mWord, mFrequency });
73         }
74 
75         @Override
equals(Object o)76         public boolean equals(Object o) {
77             if (o == this) return true;
78             if (!(o instanceof WeightedString)) return false;
79             WeightedString w = (WeightedString)o;
80             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
81         }
82     }
83 
84     /**
85      * A group of characters, with a frequency, shortcut targets, bigrams, and children.
86      *
87      * This is the central class of the in-memory representation. A CharGroup is what can
88      * be seen as a traditional "trie node", except it can hold several characters at the
89      * same time. A CharGroup essentially represents one or several characters in the middle
90      * of the trie trie; as such, it can be a terminal, and it can have children.
91      * In this in-memory representation, whether the CharGroup is a terminal or not is represented
92      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
93      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
94      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
95      */
96     public static final class CharGroup {
97         public static final int NOT_A_TERMINAL = -1;
98         final int mChars[];
99         ArrayList<WeightedString> mShortcutTargets;
100         ArrayList<WeightedString> mBigrams;
101         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
102         Node mChildren;
103         boolean mIsNotAWord; // Only a shortcut
104         boolean mIsBlacklistEntry;
105         // The two following members to help with binary generation
106         int mCachedSize;
107         int mCachedAddress;
108 
CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry)109         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
110                 final ArrayList<WeightedString> bigrams, final int frequency,
111                 final boolean isNotAWord, final boolean isBlacklistEntry) {
112             mChars = chars;
113             mFrequency = frequency;
114             mShortcutTargets = shortcutTargets;
115             mBigrams = bigrams;
116             mChildren = null;
117             mIsNotAWord = isNotAWord;
118             mIsBlacklistEntry = isBlacklistEntry;
119         }
120 
CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry, final Node children)121         public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
122                 final ArrayList<WeightedString> bigrams, final int frequency,
123                 final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) {
124             mChars = chars;
125             mFrequency = frequency;
126             mShortcutTargets = shortcutTargets;
127             mBigrams = bigrams;
128             mChildren = children;
129             mIsNotAWord = isNotAWord;
130             mIsBlacklistEntry = isBlacklistEntry;
131         }
132 
addChild(CharGroup n)133         public void addChild(CharGroup n) {
134             if (null == mChildren) {
135                 mChildren = new Node();
136             }
137             mChildren.mData.add(n);
138         }
139 
isTerminal()140         public boolean isTerminal() {
141             return NOT_A_TERMINAL != mFrequency;
142         }
143 
hasSeveralChars()144         public boolean hasSeveralChars() {
145             assert(mChars.length > 0);
146             return 1 < mChars.length;
147         }
148 
149         /**
150          * Adds a word to the bigram list. Updates the frequency if the word already
151          * exists.
152          */
addBigram(final String word, final int frequency)153         public void addBigram(final String word, final int frequency) {
154             if (mBigrams == null) {
155                 mBigrams = new ArrayList<WeightedString>();
156             }
157             WeightedString bigram = getBigram(word);
158             if (bigram != null) {
159                 bigram.mFrequency = frequency;
160             } else {
161                 bigram = new WeightedString(word, frequency);
162                 mBigrams.add(bigram);
163             }
164         }
165 
166         /**
167          * Gets the shortcut target for the given word. Returns null if the word is not in the
168          * shortcut list.
169          */
getShortcut(final String word)170         public WeightedString getShortcut(final String word) {
171             // TODO: Don't do a linear search
172             if (mShortcutTargets != null) {
173                 final int size = mShortcutTargets.size();
174                 for (int i = 0; i < size; ++i) {
175                     WeightedString shortcut = mShortcutTargets.get(i);
176                     if (shortcut.mWord.equals(word)) {
177                         return shortcut;
178                     }
179                 }
180             }
181             return null;
182         }
183 
184         /**
185          * Gets the bigram for the given word.
186          * Returns null if the word is not in the bigrams list.
187          */
getBigram(final String word)188         public WeightedString getBigram(final String word) {
189             // TODO: Don't do a linear search
190             if (mBigrams != null) {
191                 final int size = mBigrams.size();
192                 for (int i = 0; i < size; ++i) {
193                     WeightedString bigram = mBigrams.get(i);
194                     if (bigram.mWord.equals(word)) {
195                         return bigram;
196                     }
197                 }
198             }
199             return null;
200         }
201 
202         /**
203          * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to
204          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
205          * updated if they are higher than the existing ones.
206          */
update(final int frequency, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isBlacklistEntry)207         public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
208                 final ArrayList<WeightedString> bigrams,
209                 final boolean isNotAWord, final boolean isBlacklistEntry) {
210             if (frequency > mFrequency) {
211                 mFrequency = frequency;
212             }
213             if (shortcutTargets != null) {
214                 if (mShortcutTargets == null) {
215                     mShortcutTargets = shortcutTargets;
216                 } else {
217                     final int size = shortcutTargets.size();
218                     for (int i = 0; i < size; ++i) {
219                         final WeightedString shortcut = shortcutTargets.get(i);
220                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
221                         if (existingShortcut == null) {
222                             mShortcutTargets.add(shortcut);
223                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
224                             existingShortcut.mFrequency = shortcut.mFrequency;
225                         }
226                     }
227                 }
228             }
229             if (bigrams != null) {
230                 if (mBigrams == null) {
231                     mBigrams = bigrams;
232                 } else {
233                     final int size = bigrams.size();
234                     for (int i = 0; i < size; ++i) {
235                         final WeightedString bigram = bigrams.get(i);
236                         final WeightedString existingBigram = getBigram(bigram.mWord);
237                         if (existingBigram == null) {
238                             mBigrams.add(bigram);
239                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
240                             existingBigram.mFrequency = bigram.mFrequency;
241                         }
242                     }
243                 }
244             }
245             mIsNotAWord = isNotAWord;
246             mIsBlacklistEntry = isBlacklistEntry;
247         }
248     }
249 
250     /**
251      * Options global to the dictionary.
252      *
253      * There are no options at the moment, so this class is empty.
254      */
255     public static final class DictionaryOptions {
256         public final boolean mGermanUmlautProcessing;
257         public final boolean mFrenchLigatureProcessing;
258         public final HashMap<String, String> mAttributes;
DictionaryOptions(final HashMap<String, String> attributes, final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing)259         public DictionaryOptions(final HashMap<String, String> attributes,
260                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
261             mAttributes = attributes;
262             mGermanUmlautProcessing = germanUmlautProcessing;
263             mFrenchLigatureProcessing = frenchLigatureProcessing;
264         }
265     }
266 
267     public final DictionaryOptions mOptions;
268     public final Node mRoot;
269 
FusionDictionary(final Node root, final DictionaryOptions options)270     public FusionDictionary(final Node root, final DictionaryOptions options) {
271         mRoot = root;
272         mOptions = options;
273     }
274 
addOptionAttribute(final String key, final String value)275     public void addOptionAttribute(final String key, final String value) {
276         mOptions.mAttributes.put(key, value);
277     }
278 
279     /**
280      * Helper method to convert a String to an int array.
281      */
getCodePoints(final String word)282     static private int[] getCodePoints(final String word) {
283         // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray,
284         // which is not visible from the makedict package. Factor this code.
285         final char[] characters = word.toCharArray();
286         final int length = characters.length;
287         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
288         int codePoint = Character.codePointAt(characters, 0);
289         int dsti = 0;
290         for (int srci = Character.charCount(codePoint);
291                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
292             codePoints[dsti] = codePoint;
293             codePoint = Character.codePointAt(characters, srci);
294         }
295         codePoints[dsti] = codePoint;
296         return codePoints;
297     }
298 
299     /**
300      * Helper method to add a word as a string.
301      *
302      * This method adds a word to the dictionary with the given frequency. Optional
303      * lists of bigrams and shortcuts can be passed here. For each word inside,
304      * they will be added to the dictionary as necessary.
305      *
306      * @param word the word to add.
307      * @param frequency the frequency of the word, in the range [0..255].
308      * @param shortcutTargets a list of shortcut targets for this word, or null.
309      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
310      */
add(final String word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)311     public void add(final String word, final int frequency,
312             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
313         add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
314                 false /* isBlacklistEntry */);
315     }
316 
317     /**
318      * Helper method to add a blacklist entry as a string.
319      *
320      * @param word the word to add as a blacklist entry.
321      * @param shortcutTargets a list of shortcut targets for this word, or null.
322      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
323      */
addBlacklistEntry(final String word, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)324     public void addBlacklistEntry(final String word,
325             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
326         add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
327     }
328 
329     /**
330      * Sanity check for a node.
331      *
332      * This method checks that all CharGroups in a node are ordered as expected.
333      * If they are, nothing happens. If they aren't, an exception is thrown.
334      */
checkStack(Node node)335     private void checkStack(Node node) {
336         ArrayList<CharGroup> stack = node.mData;
337         int lastValue = -1;
338         for (int i = 0; i < stack.size(); ++i) {
339             int currentValue = stack.get(i).mChars[0];
340             if (currentValue <= lastValue)
341                 throw new RuntimeException("Invalid stack");
342             else
343                 lastValue = currentValue;
344         }
345     }
346 
347     /**
348      * Helper method to add a new bigram to the dictionary.
349      *
350      * @param word1 the previous word of the context
351      * @param word2 the next word of the context
352      * @param frequency the bigram frequency
353      */
setBigram(final String word1, final String word2, final int frequency)354     public void setBigram(final String word1, final String word2, final int frequency) {
355         CharGroup charGroup = findWordInTree(mRoot, word1);
356         if (charGroup != null) {
357             final CharGroup charGroup2 = findWordInTree(mRoot, word2);
358             if (charGroup2 == null) {
359                 add(getCodePoints(word2), 0, null, false /* isNotAWord */,
360                         false /* isBlacklistEntry */);
361             }
362             charGroup.addBigram(word2, frequency);
363         } else {
364             throw new RuntimeException("First word of bigram not found");
365         }
366     }
367 
368     /**
369      * Add a word to this dictionary.
370      *
371      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
372      * an exception is thrown.
373      *
374      * @param word the word, as an int array.
375      * @param frequency the frequency of the word, in the range [0..255].
376      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
377      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
378      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
379      */
add(final int[] word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry)380     private void add(final int[] word, final int frequency,
381             final ArrayList<WeightedString> shortcutTargets,
382             final boolean isNotAWord, final boolean isBlacklistEntry) {
383         assert(frequency >= 0 && frequency <= 255);
384         if (word.length >= Constants.Dictionary.MAX_WORD_LENGTH) {
385             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
386             return;
387         }
388 
389         Node currentNode = mRoot;
390         int charIndex = 0;
391 
392         CharGroup currentGroup = null;
393         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
394         int nodeIndex = findIndexOfChar(mRoot, word[charIndex]);
395         while (CHARACTER_NOT_FOUND != nodeIndex) {
396             currentGroup = currentNode.mData.get(nodeIndex);
397             differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex);
398             if (ARRAYS_ARE_EQUAL != differentCharIndex
399                     && differentCharIndex < currentGroup.mChars.length) break;
400             if (null == currentGroup.mChildren) break;
401             charIndex += currentGroup.mChars.length;
402             if (charIndex >= word.length) break;
403             currentNode = currentGroup.mChildren;
404             nodeIndex = findIndexOfChar(currentNode, word[charIndex]);
405         }
406 
407         if (-1 == nodeIndex) {
408             // No node at this point to accept the word. Create one.
409             final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]);
410             final CharGroup newGroup = new CharGroup(
411                     Arrays.copyOfRange(word, charIndex, word.length),
412                     shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
413             currentNode.mData.add(insertionIndex, newGroup);
414             if (DBG) checkStack(currentNode);
415         } else {
416             // There is a word with a common prefix.
417             if (differentCharIndex == currentGroup.mChars.length) {
418                 if (charIndex + differentCharIndex >= word.length) {
419                     // The new word is a prefix of an existing word, but the node on which it
420                     // should end already exists as is. Since the old CharNode was not a terminal,
421                     // make it one by filling in its frequency and other attributes
422                     currentGroup.update(frequency, shortcutTargets, null, isNotAWord,
423                             isBlacklistEntry);
424                 } else {
425                     // The new word matches the full old word and extends past it.
426                     // We only have to create a new node and add it to the end of this.
427                     final CharGroup newNode = new CharGroup(
428                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
429                                     shortcutTargets, null /* bigrams */, frequency, isNotAWord,
430                                     isBlacklistEntry);
431                     currentGroup.mChildren = new Node();
432                     currentGroup.mChildren.mData.add(newNode);
433                 }
434             } else {
435                 if (0 == differentCharIndex) {
436                     // Exact same word. Update the frequency if higher. This will also add the
437                     // new shortcuts to the existing shortcut list if it already exists.
438                     currentGroup.update(frequency, shortcutTargets, null,
439                             currentGroup.mIsNotAWord && isNotAWord,
440                             currentGroup.mIsBlacklistEntry || isBlacklistEntry);
441                 } else {
442                     // Partial prefix match only. We have to replace the current node with a node
443                     // containing the current prefix and create two new ones for the tails.
444                     Node newChildren = new Node();
445                     final CharGroup newOldWord = new CharGroup(
446                             Arrays.copyOfRange(currentGroup.mChars, differentCharIndex,
447                                     currentGroup.mChars.length), currentGroup.mShortcutTargets,
448                             currentGroup.mBigrams, currentGroup.mFrequency,
449                             currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry,
450                             currentGroup.mChildren);
451                     newChildren.mData.add(newOldWord);
452 
453                     final CharGroup newParent;
454                     if (charIndex + differentCharIndex >= word.length) {
455                         newParent = new CharGroup(
456                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
457                                 shortcutTargets, null /* bigrams */, frequency,
458                                 isNotAWord, isBlacklistEntry, newChildren);
459                     } else {
460                         newParent = new CharGroup(
461                                 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex),
462                                 null /* shortcutTargets */, null /* bigrams */, -1,
463                                 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
464                         final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word,
465                                 charIndex + differentCharIndex, word.length),
466                                 shortcutTargets, null /* bigrams */, frequency,
467                                 isNotAWord, isBlacklistEntry);
468                         final int addIndex = word[charIndex + differentCharIndex]
469                                 > currentGroup.mChars[differentCharIndex] ? 1 : 0;
470                         newChildren.mData.add(addIndex, newWord);
471                     }
472                     currentNode.mData.set(nodeIndex, newParent);
473                 }
474                 if (DBG) checkStack(currentNode);
475             }
476         }
477     }
478 
479     private static int ARRAYS_ARE_EQUAL = 0;
480 
481     /**
482      * Custom comparison of two int arrays taken to contain character codes.
483      *
484      * This method compares the two arrays passed as an argument in a lexicographic way,
485      * with an offset in the dst string.
486      * This method does NOT test for the first character. It is taken to be equal.
487      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
488      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
489      * strings are equal. This works BECAUSE we don't look at the first character.
490      *
491      * @param src the left-hand side string of the comparison.
492      * @param dst the right-hand side string of the comparison.
493      * @param dstOffset the offset in the right-hand side string.
494      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
495      */
compareArrays(final int[] src, final int[] dst, int dstOffset)496     private static int compareArrays(final int[] src, final int[] dst, int dstOffset) {
497         // We do NOT test the first char, because we come from a method that already
498         // tested it.
499         for (int i = 1; i < src.length; ++i) {
500             if (dstOffset + i >= dst.length) return i;
501             if (src[i] != dst[dstOffset + i]) return i;
502         }
503         if (dst.length > src.length) return src.length;
504         return ARRAYS_ARE_EQUAL;
505     }
506 
507     /**
508      * Helper class that compares and sorts two chargroups according to their
509      * first element only. I repeat: ONLY the first element is considered, the rest
510      * is ignored.
511      * This comparator imposes orderings that are inconsistent with equals.
512      */
513     static private final class CharGroupComparator implements java.util.Comparator<CharGroup> {
514         @Override
compare(CharGroup c1, CharGroup c2)515         public int compare(CharGroup c1, CharGroup c2) {
516             if (c1.mChars[0] == c2.mChars[0]) return 0;
517             return c1.mChars[0] < c2.mChars[0] ? -1 : 1;
518         }
519     }
520     final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator();
521 
522     /**
523      * Finds the insertion index of a character within a node.
524      */
findInsertionIndex(final Node node, int character)525     private static int findInsertionIndex(final Node node, int character) {
526         final ArrayList<CharGroup> data = node.mData;
527         final CharGroup reference = new CharGroup(new int[] { character },
528                 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
529                 false /* isBlacklistEntry */);
530         int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR);
531         return result >= 0 ? result : -result - 1;
532     }
533 
534     private static int CHARACTER_NOT_FOUND = -1;
535 
536     /**
537      * Find the index of a char in a node, if it exists.
538      *
539      * @param node the node to search in.
540      * @param character the character to search for.
541      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else.
542      */
findIndexOfChar(final Node node, int character)543     private static int findIndexOfChar(final Node node, int character) {
544         final int insertionIndex = findInsertionIndex(node, character);
545         if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND;
546         return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex
547                 : CHARACTER_NOT_FOUND;
548     }
549 
550     /**
551      * Helper method to find a word in a given branch.
552      */
findWordInTree(Node node, final String s)553     public static CharGroup findWordInTree(Node node, final String s) {
554         int index = 0;
555         final StringBuilder checker = DBG ? new StringBuilder() : null;
556 
557         CharGroup currentGroup;
558         final int codePointCountInS = s.codePointCount(0, s.length());
559         do {
560             int indexOfGroup = findIndexOfChar(node, s.codePointAt(index));
561             if (CHARACTER_NOT_FOUND == indexOfGroup) return null;
562             currentGroup = node.mData.get(indexOfGroup);
563 
564             if (s.length() - index < currentGroup.mChars.length) return null;
565             int newIndex = index;
566             while (newIndex < s.length() && newIndex - index < currentGroup.mChars.length) {
567                 if (currentGroup.mChars[newIndex - index] != s.codePointAt(newIndex)) return null;
568                 newIndex++;
569             }
570             index = newIndex;
571 
572             if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length));
573             if (index < codePointCountInS) {
574                 node = currentGroup.mChildren;
575             }
576         } while (null != node && index < codePointCountInS);
577 
578         if (index < codePointCountInS) return null;
579         if (!currentGroup.isTerminal()) return null;
580         if (DBG && !s.equals(checker.toString())) return null;
581         return currentGroup;
582     }
583 
584     /**
585      * Helper method to find out whether a word is in the dict or not.
586      */
hasWord(final String s)587     public boolean hasWord(final String s) {
588         if (null == s || "".equals(s)) {
589             throw new RuntimeException("Can't search for a null or empty string");
590         }
591         return null != findWordInTree(mRoot, s);
592     }
593 
594     /**
595      * Recursively count the number of character groups in a given branch of the trie.
596      *
597      * @param node the parent node.
598      * @return the number of char groups in all the branch under this node.
599      */
countCharGroups(final Node node)600     public static int countCharGroups(final Node node) {
601         final int nodeSize = node.mData.size();
602         int size = nodeSize;
603         for (int i = nodeSize - 1; i >= 0; --i) {
604             CharGroup group = node.mData.get(i);
605             if (null != group.mChildren)
606                 size += countCharGroups(group.mChildren);
607         }
608         return size;
609     }
610 
611     /**
612      * Recursively count the number of nodes in a given branch of the trie.
613      *
614      * @param node the node to count.
615      * @return the number of nodes in this branch.
616      */
countNodes(final Node node)617     public static int countNodes(final Node node) {
618         int size = 1;
619         for (int i = node.mData.size() - 1; i >= 0; --i) {
620             CharGroup group = node.mData.get(i);
621             if (null != group.mChildren)
622                 size += countNodes(group.mChildren);
623         }
624         return size;
625     }
626 
627     // Recursively find out whether there are any bigrams.
628     // This can be pretty expensive especially if there aren't any (we return as soon
629     // as we find one, so it's much cheaper if there are bigrams)
hasBigramsInternal(final Node node)630     private static boolean hasBigramsInternal(final Node node) {
631         if (null == node) return false;
632         for (int i = node.mData.size() - 1; i >= 0; --i) {
633             CharGroup group = node.mData.get(i);
634             if (null != group.mBigrams) return true;
635             if (hasBigramsInternal(group.mChildren)) return true;
636         }
637         return false;
638     }
639 
640     /**
641      * Finds out whether there are any bigrams in this dictionary.
642      *
643      * @return true if there is any bigram, false otherwise.
644      */
645     // TODO: this is expensive especially for large dictionaries without any bigram.
646     // The up side is, this is always accurate and correct and uses no memory. We should
647     // find a more efficient way of doing this, without compromising too much on memory
648     // and ease of use.
hasBigrams()649     public boolean hasBigrams() {
650         return hasBigramsInternal(mRoot);
651     }
652 
653     // Historically, the tails of the words were going to be merged to save space.
654     // However, that would prevent the code to search for a specific address in log(n)
655     // time so this was abandoned.
656     // The code is still of interest as it does add some compression to any dictionary
657     // that has no need for attributes. Implementations that does not read attributes should be
658     // able to read a dictionary with merged tails.
659     // Also, the following code does support frequencies, as in, it will only merges
660     // tails that share the same frequency. Though it would result in the above loss of
661     // performance while searching by address, it is still technically possible to merge
662     // tails that contain attributes, but this code does not take that into account - it does
663     // not compare attributes and will merge terminals with different attributes regardless.
mergeTails()664     public void mergeTails() {
665         MakedictLog.i("Do not merge tails");
666         return;
667 
668 //        MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root));
669 //        MakedictLog.i("Number of groups : " + countCharGroups(root));
670 //
671 //        final HashMap<String, ArrayList<Node>> repository =
672 //                  new HashMap<String, ArrayList<Node>>();
673 //        mergeTailsInner(repository, root);
674 //
675 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
676 //        int size = 0;
677 //        for (ArrayList<Node> a : repository.values()) {
678 //            size += a.size();
679 //        }
680 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
681 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
682     }
683 
684     // The following methods are used by the deactivated mergeTails()
685 //   private static boolean isEqual(Node a, Node b) {
686 //       if (null == a && null == b) return true;
687 //       if (null == a || null == b) return false;
688 //       if (a.data.size() != b.data.size()) return false;
689 //       final int size = a.data.size();
690 //       for (int i = size - 1; i >= 0; --i) {
691 //           CharGroup aGroup = a.data.get(i);
692 //           CharGroup bGroup = b.data.get(i);
693 //           if (aGroup.frequency != bGroup.frequency) return false;
694 //           if (aGroup.alternates == null && bGroup.alternates != null) return false;
695 //           if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false;
696 //           if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false;
697 //           if (!isEqual(aGroup.children, bGroup.children)) return false;
698 //       }
699 //       return true;
700 //   }
701 
702 //   static private HashMap<String, ArrayList<Node>> mergeTailsInner(
703 //           final HashMap<String, ArrayList<Node>> map, final Node node) {
704 //       final ArrayList<CharGroup> branches = node.data;
705 //       final int nodeSize = branches.size();
706 //       for (int i = 0; i < nodeSize; ++i) {
707 //           CharGroup group = branches.get(i);
708 //           if (null != group.children) {
709 //               String pseudoHash = getPseudoHash(group.children);
710 //               ArrayList<Node> similarList = map.get(pseudoHash);
711 //               if (null == similarList) {
712 //                   similarList = new ArrayList<Node>();
713 //                   map.put(pseudoHash, similarList);
714 //               }
715 //               boolean merged = false;
716 //               for (Node similar : similarList) {
717 //                   if (isEqual(group.children, similar)) {
718 //                       group.children = similar;
719 //                       merged = true;
720 //                       break;
721 //                   }
722 //               }
723 //               if (!merged) {
724 //                   similarList.add(group.children);
725 //               }
726 //               mergeTailsInner(map, group.children);
727 //           }
728 //       }
729 //       return map;
730 //   }
731 
732 //  private static String getPseudoHash(final Node node) {
733 //      StringBuilder s = new StringBuilder();
734 //      for (CharGroup g : node.data) {
735 //          s.append(g.frequency);
736 //          for (int ch : g.chars) {
737 //              s.append(Character.toChars(ch));
738 //          }
739 //      }
740 //      return s.toString();
741 //  }
742 
743     /**
744      * Iterator to walk through a dictionary.
745      *
746      * This is purely for convenience.
747      */
748     public static final class DictionaryIterator implements Iterator<Word> {
749         private static final class Position {
750             public Iterator<CharGroup> pos;
751             public int length;
Position(ArrayList<CharGroup> groups)752             public Position(ArrayList<CharGroup> groups) {
753                 pos = groups.iterator();
754                 length = 0;
755             }
756         }
757         final StringBuilder mCurrentString;
758         final LinkedList<Position> mPositions;
759 
DictionaryIterator(ArrayList<CharGroup> root)760         public DictionaryIterator(ArrayList<CharGroup> root) {
761             mCurrentString = new StringBuilder();
762             mPositions = new LinkedList<Position>();
763             final Position rootPos = new Position(root);
764             mPositions.add(rootPos);
765         }
766 
767         @Override
hasNext()768         public boolean hasNext() {
769             for (Position p : mPositions) {
770                 if (p.pos.hasNext()) {
771                     return true;
772                 }
773             }
774             return false;
775         }
776 
777         @Override
next()778         public Word next() {
779             Position currentPos = mPositions.getLast();
780             mCurrentString.setLength(mCurrentString.length() - currentPos.length);
781 
782             do {
783                 if (currentPos.pos.hasNext()) {
784                     final CharGroup currentGroup = currentPos.pos.next();
785                     currentPos.length = currentGroup.mChars.length;
786                     for (int i : currentGroup.mChars)
787                         mCurrentString.append(Character.toChars(i));
788                     if (null != currentGroup.mChildren) {
789                         currentPos = new Position(currentGroup.mChildren.mData);
790                         mPositions.addLast(currentPos);
791                     }
792                     if (currentGroup.mFrequency >= 0)
793                         return new Word(mCurrentString.toString(), currentGroup.mFrequency,
794                                 currentGroup.mShortcutTargets, currentGroup.mBigrams,
795                                 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry);
796                 } else {
797                     mPositions.removeLast();
798                     currentPos = mPositions.getLast();
799                     mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length);
800                 }
801             } while (true);
802         }
803 
804         @Override
remove()805         public void remove() {
806             throw new UnsupportedOperationException("Unsupported yet");
807         }
808 
809     }
810 
811     /**
812      * Method to return an iterator.
813      *
814      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
815      * and say : for (Word w : x) {}
816      */
817     @Override
iterator()818     public Iterator<Word> iterator() {
819         return new DictionaryIterator(mRoot.mData);
820     }
821 }
822