• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2011 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.inputmethod.latin.makedict;
18 
19 import com.android.inputmethod.annotations.UsedForTesting;
20 import com.android.inputmethod.latin.Constants;
21 
22 import java.util.ArrayList;
23 import java.util.Arrays;
24 import java.util.Collections;
25 import java.util.Date;
26 import java.util.HashMap;
27 import java.util.Iterator;
28 import java.util.LinkedList;
29 
30 /**
31  * A dictionary that can fusion heads and tails of words for more compression.
32  */
33 @UsedForTesting
34 public final class FusionDictionary implements Iterable<Word> {
35     private static final boolean DBG = MakedictLog.DBG;
36 
37     private static int CHARACTER_NOT_FOUND_INDEX = -1;
38 
39     /**
40      * A node array of the dictionary, containing several PtNodes.
41      *
42      * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the
43      * real information.
44      * This class also contains fields to cache size and address, to help with binary
45      * generation.
46      */
47     public static final class PtNodeArray {
48         ArrayList<PtNode> mData;
49         // To help with binary generation
50         int mCachedSize = Integer.MIN_VALUE;
51         // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They
52         // always hold the same value except between dictionary address compression, during which
53         // the update process needs to know about both values at the same time. Updating will
54         // update the AfterUpdate value, and the code will move them to BeforeUpdate before
55         // the next update pass.
56         int mCachedAddressBeforeUpdate = Integer.MIN_VALUE;
57         int mCachedAddressAfterUpdate = Integer.MIN_VALUE;
58         int mCachedParentAddress = 0;
59 
PtNodeArray()60         public PtNodeArray() {
61             mData = new ArrayList<PtNode>();
62         }
PtNodeArray(ArrayList<PtNode> data)63         public PtNodeArray(ArrayList<PtNode> data) {
64             mData = data;
65         }
66     }
67 
68     /**
69      * A string with a frequency.
70      *
71      * This represents an "attribute", that is either a bigram or a shortcut.
72      */
73     public static final class WeightedString {
74         public final String mWord;
75         public int mFrequency;
WeightedString(String word, int frequency)76         public WeightedString(String word, int frequency) {
77             mWord = word;
78             mFrequency = frequency;
79         }
80 
81         @Override
hashCode()82         public int hashCode() {
83             return Arrays.hashCode(new Object[] { mWord, mFrequency });
84         }
85 
86         @Override
equals(Object o)87         public boolean equals(Object o) {
88             if (o == this) return true;
89             if (!(o instanceof WeightedString)) return false;
90             WeightedString w = (WeightedString)o;
91             return mWord.equals(w.mWord) && mFrequency == w.mFrequency;
92         }
93     }
94 
95     /**
96      * PtNode is a group of characters, with a frequency, shortcut targets, bigrams, and children
97      * (Pt means Patricia Trie).
98      *
99      * This is the central class of the in-memory representation. A PtNode is what can
100      * be seen as a traditional "trie node", except it can hold several characters at the
101      * same time. A PtNode essentially represents one or several characters in the middle
102      * of the trie tree; as such, it can be a terminal, and it can have children.
103      * In this in-memory representation, whether the PtNode is a terminal or not is represented
104      * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other
105      * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or
106      * bigrams, but a non-terminal may not. Moreover, children, if present, are null.
107      */
108     public static final class PtNode {
109         public static final int NOT_A_TERMINAL = -1;
110         final int mChars[];
111         ArrayList<WeightedString> mShortcutTargets;
112         ArrayList<WeightedString> mBigrams;
113         int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal.
114         int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal.
115         PtNodeArray mChildren;
116         boolean mIsNotAWord; // Only a shortcut
117         boolean mIsBlacklistEntry;
118         // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary
119         // generation. Before and After always hold the same value except during dictionary
120         // address compression, where the update process needs to know about both values at the
121         // same time. Updating will update the AfterUpdate value, and the code will move them
122         // to BeforeUpdate before the next update pass.
123         // The update process does not need two versions of mCachedSize.
124         int mCachedSize; // The size, in bytes, of this PtNode.
125         int mCachedAddressBeforeUpdate; // The address of this PtNode (before update)
126         int mCachedAddressAfterUpdate; // The address of this PtNode (after update)
127 
PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry)128         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
129                 final ArrayList<WeightedString> bigrams, final int frequency,
130                 final boolean isNotAWord, final boolean isBlacklistEntry) {
131             mChars = chars;
132             mFrequency = frequency;
133             mTerminalId = frequency;
134             mShortcutTargets = shortcutTargets;
135             mBigrams = bigrams;
136             mChildren = null;
137             mIsNotAWord = isNotAWord;
138             mIsBlacklistEntry = isBlacklistEntry;
139         }
140 
PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry, final PtNodeArray children)141         public PtNode(final int[] chars, final ArrayList<WeightedString> shortcutTargets,
142                 final ArrayList<WeightedString> bigrams, final int frequency,
143                 final boolean isNotAWord, final boolean isBlacklistEntry,
144                 final PtNodeArray children) {
145             mChars = chars;
146             mFrequency = frequency;
147             mShortcutTargets = shortcutTargets;
148             mBigrams = bigrams;
149             mChildren = children;
150             mIsNotAWord = isNotAWord;
151             mIsBlacklistEntry = isBlacklistEntry;
152         }
153 
addChild(PtNode n)154         public void addChild(PtNode n) {
155             if (null == mChildren) {
156                 mChildren = new PtNodeArray();
157             }
158             mChildren.mData.add(n);
159         }
160 
getTerminalId()161         public int getTerminalId() {
162             return mTerminalId;
163         }
164 
isTerminal()165         public boolean isTerminal() {
166             return NOT_A_TERMINAL != mFrequency;
167         }
168 
getFrequency()169         public int getFrequency() {
170             return mFrequency;
171         }
172 
getIsNotAWord()173         public boolean getIsNotAWord() {
174             return mIsNotAWord;
175         }
176 
getIsBlacklistEntry()177         public boolean getIsBlacklistEntry() {
178             return mIsBlacklistEntry;
179         }
180 
getShortcutTargets()181         public ArrayList<WeightedString> getShortcutTargets() {
182             // We don't want write permission to escape outside the package, so we return a copy
183             if (null == mShortcutTargets) return null;
184             final ArrayList<WeightedString> copyOfShortcutTargets =
185                     new ArrayList<WeightedString>(mShortcutTargets);
186             return copyOfShortcutTargets;
187         }
188 
getBigrams()189         public ArrayList<WeightedString> getBigrams() {
190             // We don't want write permission to escape outside the package, so we return a copy
191             if (null == mBigrams) return null;
192             final ArrayList<WeightedString> copyOfBigrams = new ArrayList<WeightedString>(mBigrams);
193             return copyOfBigrams;
194         }
195 
hasSeveralChars()196         public boolean hasSeveralChars() {
197             assert(mChars.length > 0);
198             return 1 < mChars.length;
199         }
200 
201         /**
202          * Adds a word to the bigram list. Updates the frequency if the word already
203          * exists.
204          */
addBigram(final String word, final int frequency)205         public void addBigram(final String word, final int frequency) {
206             if (mBigrams == null) {
207                 mBigrams = new ArrayList<WeightedString>();
208             }
209             WeightedString bigram = getBigram(word);
210             if (bigram != null) {
211                 bigram.mFrequency = frequency;
212             } else {
213                 bigram = new WeightedString(word, frequency);
214                 mBigrams.add(bigram);
215             }
216         }
217 
218         /**
219          * Gets the shortcut target for the given word. Returns null if the word is not in the
220          * shortcut list.
221          */
getShortcut(final String word)222         public WeightedString getShortcut(final String word) {
223             // TODO: Don't do a linear search
224             if (mShortcutTargets != null) {
225                 final int size = mShortcutTargets.size();
226                 for (int i = 0; i < size; ++i) {
227                     WeightedString shortcut = mShortcutTargets.get(i);
228                     if (shortcut.mWord.equals(word)) {
229                         return shortcut;
230                     }
231                 }
232             }
233             return null;
234         }
235 
236         /**
237          * Gets the bigram for the given word.
238          * Returns null if the word is not in the bigrams list.
239          */
getBigram(final String word)240         public WeightedString getBigram(final String word) {
241             // TODO: Don't do a linear search
242             if (mBigrams != null) {
243                 final int size = mBigrams.size();
244                 for (int i = 0; i < size; ++i) {
245                     WeightedString bigram = mBigrams.get(i);
246                     if (bigram.mWord.equals(word)) {
247                         return bigram;
248                     }
249                 }
250             }
251             return null;
252         }
253 
254         /**
255          * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to
256          * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only
257          * updated if they are higher than the existing ones.
258          */
update(final int frequency, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isBlacklistEntry)259         public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets,
260                 final ArrayList<WeightedString> bigrams,
261                 final boolean isNotAWord, final boolean isBlacklistEntry) {
262             if (frequency > mFrequency) {
263                 mFrequency = frequency;
264             }
265             if (shortcutTargets != null) {
266                 if (mShortcutTargets == null) {
267                     mShortcutTargets = shortcutTargets;
268                 } else {
269                     final int size = shortcutTargets.size();
270                     for (int i = 0; i < size; ++i) {
271                         final WeightedString shortcut = shortcutTargets.get(i);
272                         final WeightedString existingShortcut = getShortcut(shortcut.mWord);
273                         if (existingShortcut == null) {
274                             mShortcutTargets.add(shortcut);
275                         } else if (existingShortcut.mFrequency < shortcut.mFrequency) {
276                             existingShortcut.mFrequency = shortcut.mFrequency;
277                         }
278                     }
279                 }
280             }
281             if (bigrams != null) {
282                 if (mBigrams == null) {
283                     mBigrams = bigrams;
284                 } else {
285                     final int size = bigrams.size();
286                     for (int i = 0; i < size; ++i) {
287                         final WeightedString bigram = bigrams.get(i);
288                         final WeightedString existingBigram = getBigram(bigram.mWord);
289                         if (existingBigram == null) {
290                             mBigrams.add(bigram);
291                         } else if (existingBigram.mFrequency < bigram.mFrequency) {
292                             existingBigram.mFrequency = bigram.mFrequency;
293                         }
294                     }
295                 }
296             }
297             mIsNotAWord = isNotAWord;
298             mIsBlacklistEntry = isBlacklistEntry;
299         }
300     }
301 
302     /**
303      * Options global to the dictionary.
304      */
305     public static final class DictionaryOptions {
306         public final boolean mGermanUmlautProcessing;
307         public final boolean mFrenchLigatureProcessing;
308         public final HashMap<String, String> mAttributes;
DictionaryOptions(final HashMap<String, String> attributes, final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing)309         public DictionaryOptions(final HashMap<String, String> attributes,
310                 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) {
311             mAttributes = attributes;
312             mGermanUmlautProcessing = germanUmlautProcessing;
313             mFrenchLigatureProcessing = frenchLigatureProcessing;
314         }
315         @Override
toString()316         public String toString() { // Convenience method
317             return toString(0, false);
318         }
toString(final int indentCount, final boolean plumbing)319         public String toString(final int indentCount, final boolean plumbing) {
320             final StringBuilder indent = new StringBuilder();
321             if (plumbing) {
322                 indent.append("H:");
323             } else {
324                 for (int i = 0; i < indentCount; ++i) {
325                     indent.append(" ");
326                 }
327             }
328             final StringBuilder s = new StringBuilder();
329             for (final String optionKey : mAttributes.keySet()) {
330                 s.append(indent);
331                 s.append(optionKey);
332                 s.append(" = ");
333                 if ("date".equals(optionKey) && !plumbing) {
334                     // Date needs a number of milliseconds, but the dictionary contains seconds
335                     s.append(new Date(
336                             1000 * Long.parseLong(mAttributes.get(optionKey))).toString());
337                 } else {
338                     s.append(mAttributes.get(optionKey));
339                 }
340                 s.append("\n");
341             }
342             if (mGermanUmlautProcessing) {
343                 s.append(indent);
344                 s.append("Needs German umlaut processing\n");
345             }
346             if (mFrenchLigatureProcessing) {
347                 s.append(indent);
348                 s.append("Needs French ligature processing\n");
349             }
350             return s.toString();
351         }
352     }
353 
354     public final DictionaryOptions mOptions;
355     public final PtNodeArray mRootNodeArray;
356 
FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options)357     public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) {
358         mRootNodeArray = rootNodeArray;
359         mOptions = options;
360     }
361 
addOptionAttribute(final String key, final String value)362     public void addOptionAttribute(final String key, final String value) {
363         mOptions.mAttributes.put(key, value);
364     }
365 
366     /**
367      * Helper method to convert a String to an int array.
368      */
getCodePoints(final String word)369     static int[] getCodePoints(final String word) {
370         // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray,
371         // which is not visible from the makedict package. Factor this code.
372         final int length = word.length();
373         if (length <= 0) return new int[] {};
374         final char[] characters = word.toCharArray();
375         final int[] codePoints = new int[Character.codePointCount(characters, 0, length)];
376         int codePoint = Character.codePointAt(characters, 0);
377         int dsti = 0;
378         for (int srci = Character.charCount(codePoint);
379                 srci < length; srci += Character.charCount(codePoint), ++dsti) {
380             codePoints[dsti] = codePoint;
381             codePoint = Character.codePointAt(characters, srci);
382         }
383         codePoints[dsti] = codePoint;
384         return codePoints;
385     }
386 
387     /**
388      * Helper method to add a word as a string.
389      *
390      * This method adds a word to the dictionary with the given frequency. Optional
391      * lists of bigrams and shortcuts can be passed here. For each word inside,
392      * they will be added to the dictionary as necessary.
393      *
394      * @param word the word to add.
395      * @param frequency the frequency of the word, in the range [0..255].
396      * @param shortcutTargets a list of shortcut targets for this word, or null.
397      * @param isNotAWord true if this should not be considered a word (e.g. shortcut only)
398      */
add(final String word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)399     public void add(final String word, final int frequency,
400             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
401         add(getCodePoints(word), frequency, shortcutTargets, isNotAWord,
402                 false /* isBlacklistEntry */);
403     }
404 
405     /**
406      * Helper method to add a blacklist entry as a string.
407      *
408      * @param word the word to add as a blacklist entry.
409      * @param shortcutTargets a list of shortcut targets for this word, or null.
410      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
411      */
addBlacklistEntry(final String word, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)412     public void addBlacklistEntry(final String word,
413             final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) {
414         add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */);
415     }
416 
417     /**
418      * Sanity check for a PtNode array.
419      *
420      * This method checks that all PtNodes in a node array are ordered as expected.
421      * If they are, nothing happens. If they aren't, an exception is thrown.
422      */
checkStack(PtNodeArray ptNodeArray)423     private void checkStack(PtNodeArray ptNodeArray) {
424         ArrayList<PtNode> stack = ptNodeArray.mData;
425         int lastValue = -1;
426         for (int i = 0; i < stack.size(); ++i) {
427             int currentValue = stack.get(i).mChars[0];
428             if (currentValue <= lastValue)
429                 throw new RuntimeException("Invalid stack");
430             else
431                 lastValue = currentValue;
432         }
433     }
434 
435     /**
436      * Helper method to add a new bigram to the dictionary.
437      *
438      * @param word1 the previous word of the context
439      * @param word2 the next word of the context
440      * @param frequency the bigram frequency
441      */
setBigram(final String word1, final String word2, final int frequency)442     public void setBigram(final String word1, final String word2, final int frequency) {
443         PtNode ptNode = findWordInTree(mRootNodeArray, word1);
444         if (ptNode != null) {
445             final PtNode ptNode2 = findWordInTree(mRootNodeArray, word2);
446             if (ptNode2 == null) {
447                 add(getCodePoints(word2), 0, null, false /* isNotAWord */,
448                         false /* isBlacklistEntry */);
449                 // The PtNode for the first word may have moved by the above insertion,
450                 // if word1 and word2 share a common stem that happens not to have been
451                 // a cutting point until now. In this case, we need to refresh ptNode.
452                 ptNode = findWordInTree(mRootNodeArray, word1);
453             }
454             ptNode.addBigram(word2, frequency);
455         } else {
456             throw new RuntimeException("First word of bigram not found");
457         }
458     }
459 
460     /**
461      * Add a word to this dictionary.
462      *
463      * The shortcuts, if any, have to be in the dictionary already. If they aren't,
464      * an exception is thrown.
465      *
466      * @param word the word, as an int array.
467      * @param frequency the frequency of the word, in the range [0..255].
468      * @param shortcutTargets an optional list of shortcut targets for this word (null if none).
469      * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so)
470      * @param isBlacklistEntry true if this is a blacklisted word, false otherwise
471      */
add(final int[] word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry)472     private void add(final int[] word, final int frequency,
473             final ArrayList<WeightedString> shortcutTargets,
474             final boolean isNotAWord, final boolean isBlacklistEntry) {
475         assert(frequency >= 0 && frequency <= 255);
476         if (word.length >= Constants.DICTIONARY_MAX_WORD_LENGTH) {
477             MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length);
478             return;
479         }
480 
481         PtNodeArray currentNodeArray = mRootNodeArray;
482         int charIndex = 0;
483 
484         PtNode currentPtNode = null;
485         int differentCharIndex = 0; // Set by the loop to the index of the char that differs
486         int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]);
487         while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) {
488             currentPtNode = currentNodeArray.mData.get(nodeIndex);
489             differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex);
490             if (ARRAYS_ARE_EQUAL != differentCharIndex
491                     && differentCharIndex < currentPtNode.mChars.length) break;
492             if (null == currentPtNode.mChildren) break;
493             charIndex += currentPtNode.mChars.length;
494             if (charIndex >= word.length) break;
495             currentNodeArray = currentPtNode.mChildren;
496             nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]);
497         }
498 
499         if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) {
500             // No node at this point to accept the word. Create one.
501             final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]);
502             final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length),
503                     shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry);
504             currentNodeArray.mData.add(insertionIndex, newPtNode);
505             if (DBG) checkStack(currentNodeArray);
506         } else {
507             // There is a word with a common prefix.
508             if (differentCharIndex == currentPtNode.mChars.length) {
509                 if (charIndex + differentCharIndex >= word.length) {
510                     // The new word is a prefix of an existing word, but the node on which it
511                     // should end already exists as is. Since the old PtNode was not a terminal,
512                     // make it one by filling in its frequency and other attributes
513                     currentPtNode.update(frequency, shortcutTargets, null, isNotAWord,
514                             isBlacklistEntry);
515                 } else {
516                     // The new word matches the full old word and extends past it.
517                     // We only have to create a new node and add it to the end of this.
518                     final PtNode newNode = new PtNode(
519                             Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length),
520                                     shortcutTargets, null /* bigrams */, frequency, isNotAWord,
521                                     isBlacklistEntry);
522                     currentPtNode.mChildren = new PtNodeArray();
523                     currentPtNode.mChildren.mData.add(newNode);
524                 }
525             } else {
526                 if (0 == differentCharIndex) {
527                     // Exact same word. Update the frequency if higher. This will also add the
528                     // new shortcuts to the existing shortcut list if it already exists.
529                     currentPtNode.update(frequency, shortcutTargets, null,
530                             currentPtNode.mIsNotAWord && isNotAWord,
531                             currentPtNode.mIsBlacklistEntry || isBlacklistEntry);
532                 } else {
533                     // Partial prefix match only. We have to replace the current node with a node
534                     // containing the current prefix and create two new ones for the tails.
535                     PtNodeArray newChildren = new PtNodeArray();
536                     final PtNode newOldWord = new PtNode(
537                             Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex,
538                                     currentPtNode.mChars.length), currentPtNode.mShortcutTargets,
539                             currentPtNode.mBigrams, currentPtNode.mFrequency,
540                             currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry,
541                             currentPtNode.mChildren);
542                     newChildren.mData.add(newOldWord);
543 
544                     final PtNode newParent;
545                     if (charIndex + differentCharIndex >= word.length) {
546                         newParent = new PtNode(
547                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
548                                 shortcutTargets, null /* bigrams */, frequency,
549                                 isNotAWord, isBlacklistEntry, newChildren);
550                     } else {
551                         newParent = new PtNode(
552                                 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex),
553                                 null /* shortcutTargets */, null /* bigrams */, -1,
554                                 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren);
555                         final PtNode newWord = new PtNode(Arrays.copyOfRange(word,
556                                 charIndex + differentCharIndex, word.length),
557                                 shortcutTargets, null /* bigrams */, frequency,
558                                 isNotAWord, isBlacklistEntry);
559                         final int addIndex = word[charIndex + differentCharIndex]
560                                 > currentPtNode.mChars[differentCharIndex] ? 1 : 0;
561                         newChildren.mData.add(addIndex, newWord);
562                     }
563                     currentNodeArray.mData.set(nodeIndex, newParent);
564                 }
565                 if (DBG) checkStack(currentNodeArray);
566             }
567         }
568     }
569 
570     private static int ARRAYS_ARE_EQUAL = 0;
571 
572     /**
573      * Custom comparison of two int arrays taken to contain character codes.
574      *
575      * This method compares the two arrays passed as an argument in a lexicographic way,
576      * with an offset in the dst string.
577      * This method does NOT test for the first character. It is taken to be equal.
578      * I repeat: this method starts the comparison at 1 <> dstOffset + 1.
579      * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the
580      * strings are equal. This works BECAUSE we don't look at the first character.
581      *
582      * @param src the left-hand side string of the comparison.
583      * @param dst the right-hand side string of the comparison.
584      * @param dstOffset the offset in the right-hand side string.
585      * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't.
586      */
compareCharArrays(final int[] src, final int[] dst, int dstOffset)587     private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) {
588         // We do NOT test the first char, because we come from a method that already
589         // tested it.
590         for (int i = 1; i < src.length; ++i) {
591             if (dstOffset + i >= dst.length) return i;
592             if (src[i] != dst[dstOffset + i]) return i;
593         }
594         if (dst.length > src.length) return src.length;
595         return ARRAYS_ARE_EQUAL;
596     }
597 
598     /**
599      * Helper class that compares and sorts two PtNodes according to their
600      * first element only. I repeat: ONLY the first element is considered, the rest
601      * is ignored.
602      * This comparator imposes orderings that are inconsistent with equals.
603      */
604     static private final class PtNodeComparator implements java.util.Comparator<PtNode> {
605         @Override
compare(PtNode p1, PtNode p2)606         public int compare(PtNode p1, PtNode p2) {
607             if (p1.mChars[0] == p2.mChars[0]) return 0;
608             return p1.mChars[0] < p2.mChars[0] ? -1 : 1;
609         }
610     }
611     final static private PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator();
612 
613     /**
614      * Finds the insertion index of a character within a node array.
615      */
findInsertionIndex(final PtNodeArray nodeArray, int character)616     private static int findInsertionIndex(final PtNodeArray nodeArray, int character) {
617         final ArrayList<PtNode> data = nodeArray.mData;
618         final PtNode reference = new PtNode(new int[] { character },
619                 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */,
620                 false /* isBlacklistEntry */);
621         int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR);
622         return result >= 0 ? result : -result - 1;
623     }
624 
625     /**
626      * Find the index of a char in a node array, if it exists.
627      *
628      * @param nodeArray the node array to search in.
629      * @param character the character to search for.
630      * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else.
631      */
findIndexOfChar(final PtNodeArray nodeArray, int character)632     private static int findIndexOfChar(final PtNodeArray nodeArray, int character) {
633         final int insertionIndex = findInsertionIndex(nodeArray, character);
634         if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX;
635         return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex
636                 : CHARACTER_NOT_FOUND_INDEX;
637     }
638 
639     /**
640      * Helper method to find a word in a given branch.
641      */
642     @SuppressWarnings("unused")
findWordInTree(PtNodeArray nodeArray, final String string)643     public static PtNode findWordInTree(PtNodeArray nodeArray, final String string) {
644         int index = 0;
645         final StringBuilder checker = DBG ? new StringBuilder() : null;
646         final int[] codePoints = getCodePoints(string);
647 
648         PtNode currentPtNode;
649         do {
650             int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]);
651             if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null;
652             currentPtNode = nodeArray.mData.get(indexOfGroup);
653 
654             if (codePoints.length - index < currentPtNode.mChars.length) return null;
655             int newIndex = index;
656             while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) {
657                 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null;
658                 newIndex++;
659             }
660             index = newIndex;
661 
662             if (DBG) {
663                 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length));
664             }
665             if (index < codePoints.length) {
666                 nodeArray = currentPtNode.mChildren;
667             }
668         } while (null != nodeArray && index < codePoints.length);
669 
670         if (index < codePoints.length) return null;
671         if (!currentPtNode.isTerminal()) return null;
672         if (DBG && !string.equals(checker.toString())) return null;
673         return currentPtNode;
674     }
675 
676     /**
677      * Helper method to find out whether a word is in the dict or not.
678      */
hasWord(final String s)679     public boolean hasWord(final String s) {
680         if (null == s || "".equals(s)) {
681             throw new RuntimeException("Can't search for a null or empty string");
682         }
683         return null != findWordInTree(mRootNodeArray, s);
684     }
685 
686     /**
687      * Recursively count the number of PtNodes in a given branch of the trie.
688      *
689      * @param nodeArray the parent node.
690      * @return the number of PtNodes in all the branch under this node.
691      */
countPtNodes(final PtNodeArray nodeArray)692     public static int countPtNodes(final PtNodeArray nodeArray) {
693         final int nodeSize = nodeArray.mData.size();
694         int size = nodeSize;
695         for (int i = nodeSize - 1; i >= 0; --i) {
696             PtNode ptNode = nodeArray.mData.get(i);
697             if (null != ptNode.mChildren)
698                 size += countPtNodes(ptNode.mChildren);
699         }
700         return size;
701     }
702 
703     /**
704      * Recursively count the number of nodes in a given branch of the trie.
705      *
706      * @param nodeArray the node array to count.
707      * @return the number of nodes in this branch.
708      */
countNodeArrays(final PtNodeArray nodeArray)709     public static int countNodeArrays(final PtNodeArray nodeArray) {
710         int size = 1;
711         for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
712             PtNode ptNode = nodeArray.mData.get(i);
713             if (null != ptNode.mChildren)
714                 size += countNodeArrays(ptNode.mChildren);
715         }
716         return size;
717     }
718 
719     // Recursively find out whether there are any bigrams.
720     // This can be pretty expensive especially if there aren't any (we return as soon
721     // as we find one, so it's much cheaper if there are bigrams)
hasBigramsInternal(final PtNodeArray nodeArray)722     private static boolean hasBigramsInternal(final PtNodeArray nodeArray) {
723         if (null == nodeArray) return false;
724         for (int i = nodeArray.mData.size() - 1; i >= 0; --i) {
725             PtNode ptNode = nodeArray.mData.get(i);
726             if (null != ptNode.mBigrams) return true;
727             if (hasBigramsInternal(ptNode.mChildren)) return true;
728         }
729         return false;
730     }
731 
732     /**
733      * Finds out whether there are any bigrams in this dictionary.
734      *
735      * @return true if there is any bigram, false otherwise.
736      */
737     // TODO: this is expensive especially for large dictionaries without any bigram.
738     // The up side is, this is always accurate and correct and uses no memory. We should
739     // find a more efficient way of doing this, without compromising too much on memory
740     // and ease of use.
hasBigrams()741     public boolean hasBigrams() {
742         return hasBigramsInternal(mRootNodeArray);
743     }
744 
745     // Historically, the tails of the words were going to be merged to save space.
746     // However, that would prevent the code to search for a specific address in log(n)
747     // time so this was abandoned.
748     // The code is still of interest as it does add some compression to any dictionary
749     // that has no need for attributes. Implementations that does not read attributes should be
750     // able to read a dictionary with merged tails.
751     // Also, the following code does support frequencies, as in, it will only merges
752     // tails that share the same frequency. Though it would result in the above loss of
753     // performance while searching by address, it is still technically possible to merge
754     // tails that contain attributes, but this code does not take that into account - it does
755     // not compare attributes and will merge terminals with different attributes regardless.
mergeTails()756     public void mergeTails() {
757         MakedictLog.i("Do not merge tails");
758         return;
759 
760 //        MakedictLog.i("Merging PtNodes. Number of PtNodes : " + countPtNodes(root));
761 //        MakedictLog.i("Number of PtNodes : " + countPtNodes(root));
762 //
763 //        final HashMap<String, ArrayList<PtNodeArray>> repository =
764 //                  new HashMap<String, ArrayList<PtNodeArray>>();
765 //        mergeTailsInner(repository, root);
766 //
767 //        MakedictLog.i("Number of different pseudohashes : " + repository.size());
768 //        int size = 0;
769 //        for (ArrayList<PtNodeArray> a : repository.values()) {
770 //            size += a.size();
771 //        }
772 //        MakedictLog.i("Number of nodes after merge : " + (1 + size));
773 //        MakedictLog.i("Recursively seen nodes : " + countNodes(root));
774     }
775 
776     // The following methods are used by the deactivated mergeTails()
777 //   private static boolean isEqual(PtNodeArray a, PtNodeArray b) {
778 //       if (null == a && null == b) return true;
779 //       if (null == a || null == b) return false;
780 //       if (a.data.size() != b.data.size()) return false;
781 //       final int size = a.data.size();
782 //       for (int i = size - 1; i >= 0; --i) {
783 //           PtNode aPtNode = a.data.get(i);
784 //           PtNode bPtNode = b.data.get(i);
785 //           if (aPtNode.frequency != bPtNode.frequency) return false;
786 //           if (aPtNode.alternates == null && bPtNode.alternates != null) return false;
787 //           if (aPtNode.alternates != null && !aPtNode.equals(bPtNode.alternates)) return false;
788 //           if (!Arrays.equals(aPtNode.chars, bPtNode.chars)) return false;
789 //           if (!isEqual(aPtNode.children, bPtNode.children)) return false;
790 //       }
791 //       return true;
792 //   }
793 
794 //   static private HashMap<String, ArrayList<PtNodeArray>> mergeTailsInner(
795 //           final HashMap<String, ArrayList<PtNodeArray>> map, final PtNodeArray nodeArray) {
796 //       final ArrayList<PtNode> branches = nodeArray.data;
797 //       final int nodeSize = branches.size();
798 //       for (int i = 0; i < nodeSize; ++i) {
799 //           PtNode ptNode = branches.get(i);
800 //           if (null != ptNode.children) {
801 //               String pseudoHash = getPseudoHash(ptNode.children);
802 //               ArrayList<PtNodeArray> similarList = map.get(pseudoHash);
803 //               if (null == similarList) {
804 //                   similarList = new ArrayList<PtNodeArray>();
805 //                   map.put(pseudoHash, similarList);
806 //               }
807 //               boolean merged = false;
808 //               for (PtNodeArray similar : similarList) {
809 //                   if (isEqual(ptNode.children, similar)) {
810 //                       ptNode.children = similar;
811 //                       merged = true;
812 //                       break;
813 //                   }
814 //               }
815 //               if (!merged) {
816 //                   similarList.add(ptNode.children);
817 //               }
818 //               mergeTailsInner(map, ptNode.children);
819 //           }
820 //       }
821 //       return map;
822 //   }
823 
824 //  private static String getPseudoHash(final PtNodeArray nodeArray) {
825 //      StringBuilder s = new StringBuilder();
826 //      for (PtNode ptNode : nodeArray.data) {
827 //          s.append(ptNode.frequency);
828 //          for (int ch : ptNode.chars) {
829 //              s.append(Character.toChars(ch));
830 //          }
831 //      }
832 //      return s.toString();
833 //  }
834 
835     /**
836      * Iterator to walk through a dictionary.
837      *
838      * This is purely for convenience.
839      */
840     public static final class DictionaryIterator implements Iterator<Word> {
841         private static final class Position {
842             public Iterator<PtNode> pos;
843             public int length;
Position(ArrayList<PtNode> ptNodes)844             public Position(ArrayList<PtNode> ptNodes) {
845                 pos = ptNodes.iterator();
846                 length = 0;
847             }
848         }
849         final StringBuilder mCurrentString;
850         final LinkedList<Position> mPositions;
851 
DictionaryIterator(ArrayList<PtNode> ptRoot)852         public DictionaryIterator(ArrayList<PtNode> ptRoot) {
853             mCurrentString = new StringBuilder();
854             mPositions = new LinkedList<Position>();
855             final Position rootPos = new Position(ptRoot);
856             mPositions.add(rootPos);
857         }
858 
859         @Override
hasNext()860         public boolean hasNext() {
861             for (Position p : mPositions) {
862                 if (p.pos.hasNext()) {
863                     return true;
864                 }
865             }
866             return false;
867         }
868 
869         @Override
next()870         public Word next() {
871             Position currentPos = mPositions.getLast();
872             mCurrentString.setLength(currentPos.length);
873 
874             do {
875                 if (currentPos.pos.hasNext()) {
876                     final PtNode currentPtNode = currentPos.pos.next();
877                     currentPos.length = mCurrentString.length();
878                     for (int i : currentPtNode.mChars) {
879                         mCurrentString.append(Character.toChars(i));
880                     }
881                     if (null != currentPtNode.mChildren) {
882                         currentPos = new Position(currentPtNode.mChildren.mData);
883                         currentPos.length = mCurrentString.length();
884                         mPositions.addLast(currentPos);
885                     }
886                     if (currentPtNode.mFrequency >= 0) {
887                         return new Word(mCurrentString.toString(), currentPtNode.mFrequency,
888                                 currentPtNode.mShortcutTargets, currentPtNode.mBigrams,
889                                 currentPtNode.mIsNotAWord, currentPtNode.mIsBlacklistEntry);
890                     }
891                 } else {
892                     mPositions.removeLast();
893                     currentPos = mPositions.getLast();
894                     mCurrentString.setLength(mPositions.getLast().length);
895                 }
896             } while (true);
897         }
898 
899         @Override
remove()900         public void remove() {
901             throw new UnsupportedOperationException("Unsupported yet");
902         }
903 
904     }
905 
906     /**
907      * Method to return an iterator.
908      *
909      * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x
910      * and say : for (Word w : x) {}
911      */
912     @Override
iterator()913     public Iterator<Word> iterator() {
914         return new DictionaryIterator(mRootNodeArray.mData);
915     }
916 }
917