1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not 5 * use this file except in compliance with the License. You may obtain a copy of 6 * the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 12 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 13 * License for the specific language governing permissions and limitations under 14 * the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.latin.Constants; 20 21 import java.util.ArrayList; 22 import java.util.Arrays; 23 import java.util.Collections; 24 import java.util.HashMap; 25 import java.util.Iterator; 26 import java.util.LinkedList; 27 28 /** 29 * A dictionary that can fusion heads and tails of words for more compression. 30 */ 31 public final class FusionDictionary implements Iterable<Word> { 32 private static final boolean DBG = MakedictLog.DBG; 33 34 /** 35 * A node of the dictionary, containing several CharGroups. 36 * 37 * A node is but an ordered array of CharGroups, which essentially contain all the 38 * real information. 39 * This class also contains fields to cache size and address, to help with binary 40 * generation. 41 */ 42 public static final class Node { 43 ArrayList<CharGroup> mData; 44 // To help with binary generation 45 int mCachedSize = Integer.MIN_VALUE; 46 int mCachedAddress = Integer.MIN_VALUE; 47 int mCachedParentAddress = 0; 48 Node()49 public Node() { 50 mData = new ArrayList<CharGroup>(); 51 } Node(ArrayList<CharGroup> data)52 public Node(ArrayList<CharGroup> data) { 53 mData = data; 54 } 55 } 56 57 /** 58 * A string with a frequency. 59 * 60 * This represents an "attribute", that is either a bigram or a shortcut. 61 */ 62 public static final class WeightedString { 63 public final String mWord; 64 public int mFrequency; WeightedString(String word, int frequency)65 public WeightedString(String word, int frequency) { 66 mWord = word; 67 mFrequency = frequency; 68 } 69 70 @Override hashCode()71 public int hashCode() { 72 return Arrays.hashCode(new Object[] { mWord, mFrequency }); 73 } 74 75 @Override equals(Object o)76 public boolean equals(Object o) { 77 if (o == this) return true; 78 if (!(o instanceof WeightedString)) return false; 79 WeightedString w = (WeightedString)o; 80 return mWord.equals(w.mWord) && mFrequency == w.mFrequency; 81 } 82 } 83 84 /** 85 * A group of characters, with a frequency, shortcut targets, bigrams, and children. 86 * 87 * This is the central class of the in-memory representation. A CharGroup is what can 88 * be seen as a traditional "trie node", except it can hold several characters at the 89 * same time. A CharGroup essentially represents one or several characters in the middle 90 * of the trie trie; as such, it can be a terminal, and it can have children. 91 * In this in-memory representation, whether the CharGroup is a terminal or not is represented 92 * in the frequency, where NOT_A_TERMINAL (= -1) means this is not a terminal and any other 93 * value is the frequency of this terminal. A terminal may have non-null shortcuts and/or 94 * bigrams, but a non-terminal may not. Moreover, children, if present, are null. 95 */ 96 public static final class CharGroup { 97 public static final int NOT_A_TERMINAL = -1; 98 final int mChars[]; 99 ArrayList<WeightedString> mShortcutTargets; 100 ArrayList<WeightedString> mBigrams; 101 int mFrequency; // NOT_A_TERMINAL == mFrequency indicates this is not a terminal. 102 Node mChildren; 103 boolean mIsNotAWord; // Only a shortcut 104 boolean mIsBlacklistEntry; 105 // The two following members to help with binary generation 106 int mCachedSize; 107 int mCachedAddress; 108 CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry)109 public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 110 final ArrayList<WeightedString> bigrams, final int frequency, 111 final boolean isNotAWord, final boolean isBlacklistEntry) { 112 mChars = chars; 113 mFrequency = frequency; 114 mShortcutTargets = shortcutTargets; 115 mBigrams = bigrams; 116 mChildren = null; 117 mIsNotAWord = isNotAWord; 118 mIsBlacklistEntry = isBlacklistEntry; 119 } 120 CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final int frequency, final boolean isNotAWord, final boolean isBlacklistEntry, final Node children)121 public CharGroup(final int[] chars, final ArrayList<WeightedString> shortcutTargets, 122 final ArrayList<WeightedString> bigrams, final int frequency, 123 final boolean isNotAWord, final boolean isBlacklistEntry, final Node children) { 124 mChars = chars; 125 mFrequency = frequency; 126 mShortcutTargets = shortcutTargets; 127 mBigrams = bigrams; 128 mChildren = children; 129 mIsNotAWord = isNotAWord; 130 mIsBlacklistEntry = isBlacklistEntry; 131 } 132 addChild(CharGroup n)133 public void addChild(CharGroup n) { 134 if (null == mChildren) { 135 mChildren = new Node(); 136 } 137 mChildren.mData.add(n); 138 } 139 isTerminal()140 public boolean isTerminal() { 141 return NOT_A_TERMINAL != mFrequency; 142 } 143 hasSeveralChars()144 public boolean hasSeveralChars() { 145 assert(mChars.length > 0); 146 return 1 < mChars.length; 147 } 148 149 /** 150 * Adds a word to the bigram list. Updates the frequency if the word already 151 * exists. 152 */ addBigram(final String word, final int frequency)153 public void addBigram(final String word, final int frequency) { 154 if (mBigrams == null) { 155 mBigrams = new ArrayList<WeightedString>(); 156 } 157 WeightedString bigram = getBigram(word); 158 if (bigram != null) { 159 bigram.mFrequency = frequency; 160 } else { 161 bigram = new WeightedString(word, frequency); 162 mBigrams.add(bigram); 163 } 164 } 165 166 /** 167 * Gets the shortcut target for the given word. Returns null if the word is not in the 168 * shortcut list. 169 */ getShortcut(final String word)170 public WeightedString getShortcut(final String word) { 171 // TODO: Don't do a linear search 172 if (mShortcutTargets != null) { 173 final int size = mShortcutTargets.size(); 174 for (int i = 0; i < size; ++i) { 175 WeightedString shortcut = mShortcutTargets.get(i); 176 if (shortcut.mWord.equals(word)) { 177 return shortcut; 178 } 179 } 180 } 181 return null; 182 } 183 184 /** 185 * Gets the bigram for the given word. 186 * Returns null if the word is not in the bigrams list. 187 */ getBigram(final String word)188 public WeightedString getBigram(final String word) { 189 // TODO: Don't do a linear search 190 if (mBigrams != null) { 191 final int size = mBigrams.size(); 192 for (int i = 0; i < size; ++i) { 193 WeightedString bigram = mBigrams.get(i); 194 if (bigram.mWord.equals(word)) { 195 return bigram; 196 } 197 } 198 } 199 return null; 200 } 201 202 /** 203 * Updates the CharGroup with the given properties. Adds the shortcut and bigram lists to 204 * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 205 * updated if they are higher than the existing ones. 206 */ update(final int frequency, final ArrayList<WeightedString> shortcutTargets, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isBlacklistEntry)207 public void update(final int frequency, final ArrayList<WeightedString> shortcutTargets, 208 final ArrayList<WeightedString> bigrams, 209 final boolean isNotAWord, final boolean isBlacklistEntry) { 210 if (frequency > mFrequency) { 211 mFrequency = frequency; 212 } 213 if (shortcutTargets != null) { 214 if (mShortcutTargets == null) { 215 mShortcutTargets = shortcutTargets; 216 } else { 217 final int size = shortcutTargets.size(); 218 for (int i = 0; i < size; ++i) { 219 final WeightedString shortcut = shortcutTargets.get(i); 220 final WeightedString existingShortcut = getShortcut(shortcut.mWord); 221 if (existingShortcut == null) { 222 mShortcutTargets.add(shortcut); 223 } else if (existingShortcut.mFrequency < shortcut.mFrequency) { 224 existingShortcut.mFrequency = shortcut.mFrequency; 225 } 226 } 227 } 228 } 229 if (bigrams != null) { 230 if (mBigrams == null) { 231 mBigrams = bigrams; 232 } else { 233 final int size = bigrams.size(); 234 for (int i = 0; i < size; ++i) { 235 final WeightedString bigram = bigrams.get(i); 236 final WeightedString existingBigram = getBigram(bigram.mWord); 237 if (existingBigram == null) { 238 mBigrams.add(bigram); 239 } else if (existingBigram.mFrequency < bigram.mFrequency) { 240 existingBigram.mFrequency = bigram.mFrequency; 241 } 242 } 243 } 244 } 245 mIsNotAWord = isNotAWord; 246 mIsBlacklistEntry = isBlacklistEntry; 247 } 248 } 249 250 /** 251 * Options global to the dictionary. 252 * 253 * There are no options at the moment, so this class is empty. 254 */ 255 public static final class DictionaryOptions { 256 public final boolean mGermanUmlautProcessing; 257 public final boolean mFrenchLigatureProcessing; 258 public final HashMap<String, String> mAttributes; DictionaryOptions(final HashMap<String, String> attributes, final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing)259 public DictionaryOptions(final HashMap<String, String> attributes, 260 final boolean germanUmlautProcessing, final boolean frenchLigatureProcessing) { 261 mAttributes = attributes; 262 mGermanUmlautProcessing = germanUmlautProcessing; 263 mFrenchLigatureProcessing = frenchLigatureProcessing; 264 } 265 } 266 267 public final DictionaryOptions mOptions; 268 public final Node mRoot; 269 FusionDictionary(final Node root, final DictionaryOptions options)270 public FusionDictionary(final Node root, final DictionaryOptions options) { 271 mRoot = root; 272 mOptions = options; 273 } 274 addOptionAttribute(final String key, final String value)275 public void addOptionAttribute(final String key, final String value) { 276 mOptions.mAttributes.put(key, value); 277 } 278 279 /** 280 * Helper method to convert a String to an int array. 281 */ getCodePoints(final String word)282 static private int[] getCodePoints(final String word) { 283 // TODO: this is a copy-paste of the contents of StringUtils.toCodePointArray, 284 // which is not visible from the makedict package. Factor this code. 285 final char[] characters = word.toCharArray(); 286 final int length = characters.length; 287 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 288 int codePoint = Character.codePointAt(characters, 0); 289 int dsti = 0; 290 for (int srci = Character.charCount(codePoint); 291 srci < length; srci += Character.charCount(codePoint), ++dsti) { 292 codePoints[dsti] = codePoint; 293 codePoint = Character.codePointAt(characters, srci); 294 } 295 codePoints[dsti] = codePoint; 296 return codePoints; 297 } 298 299 /** 300 * Helper method to add a word as a string. 301 * 302 * This method adds a word to the dictionary with the given frequency. Optional 303 * lists of bigrams and shortcuts can be passed here. For each word inside, 304 * they will be added to the dictionary as necessary. 305 * 306 * @param word the word to add. 307 * @param frequency the frequency of the word, in the range [0..255]. 308 * @param shortcutTargets a list of shortcut targets for this word, or null. 309 * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 310 */ add(final String word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)311 public void add(final String word, final int frequency, 312 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 313 add(getCodePoints(word), frequency, shortcutTargets, isNotAWord, 314 false /* isBlacklistEntry */); 315 } 316 317 /** 318 * Helper method to add a blacklist entry as a string. 319 * 320 * @param word the word to add as a blacklist entry. 321 * @param shortcutTargets a list of shortcut targets for this word, or null. 322 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 323 */ addBlacklistEntry(final String word, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord)324 public void addBlacklistEntry(final String word, 325 final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord) { 326 add(getCodePoints(word), 0, shortcutTargets, isNotAWord, true /* isBlacklistEntry */); 327 } 328 329 /** 330 * Sanity check for a node. 331 * 332 * This method checks that all CharGroups in a node are ordered as expected. 333 * If they are, nothing happens. If they aren't, an exception is thrown. 334 */ checkStack(Node node)335 private void checkStack(Node node) { 336 ArrayList<CharGroup> stack = node.mData; 337 int lastValue = -1; 338 for (int i = 0; i < stack.size(); ++i) { 339 int currentValue = stack.get(i).mChars[0]; 340 if (currentValue <= lastValue) 341 throw new RuntimeException("Invalid stack"); 342 else 343 lastValue = currentValue; 344 } 345 } 346 347 /** 348 * Helper method to add a new bigram to the dictionary. 349 * 350 * @param word1 the previous word of the context 351 * @param word2 the next word of the context 352 * @param frequency the bigram frequency 353 */ setBigram(final String word1, final String word2, final int frequency)354 public void setBigram(final String word1, final String word2, final int frequency) { 355 CharGroup charGroup = findWordInTree(mRoot, word1); 356 if (charGroup != null) { 357 final CharGroup charGroup2 = findWordInTree(mRoot, word2); 358 if (charGroup2 == null) { 359 add(getCodePoints(word2), 0, null, false /* isNotAWord */, 360 false /* isBlacklistEntry */); 361 } 362 charGroup.addBigram(word2, frequency); 363 } else { 364 throw new RuntimeException("First word of bigram not found"); 365 } 366 } 367 368 /** 369 * Add a word to this dictionary. 370 * 371 * The shortcuts, if any, have to be in the dictionary already. If they aren't, 372 * an exception is thrown. 373 * 374 * @param word the word, as an int array. 375 * @param frequency the frequency of the word, in the range [0..255]. 376 * @param shortcutTargets an optional list of shortcut targets for this word (null if none). 377 * @param isNotAWord true if this is not a word for spellcheking purposes (shortcut only or so) 378 * @param isBlacklistEntry true if this is a blacklisted word, false otherwise 379 */ add(final int[] word, final int frequency, final ArrayList<WeightedString> shortcutTargets, final boolean isNotAWord, final boolean isBlacklistEntry)380 private void add(final int[] word, final int frequency, 381 final ArrayList<WeightedString> shortcutTargets, 382 final boolean isNotAWord, final boolean isBlacklistEntry) { 383 assert(frequency >= 0 && frequency <= 255); 384 if (word.length >= Constants.Dictionary.MAX_WORD_LENGTH) { 385 MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 386 return; 387 } 388 389 Node currentNode = mRoot; 390 int charIndex = 0; 391 392 CharGroup currentGroup = null; 393 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 394 int nodeIndex = findIndexOfChar(mRoot, word[charIndex]); 395 while (CHARACTER_NOT_FOUND != nodeIndex) { 396 currentGroup = currentNode.mData.get(nodeIndex); 397 differentCharIndex = compareArrays(currentGroup.mChars, word, charIndex); 398 if (ARRAYS_ARE_EQUAL != differentCharIndex 399 && differentCharIndex < currentGroup.mChars.length) break; 400 if (null == currentGroup.mChildren) break; 401 charIndex += currentGroup.mChars.length; 402 if (charIndex >= word.length) break; 403 currentNode = currentGroup.mChildren; 404 nodeIndex = findIndexOfChar(currentNode, word[charIndex]); 405 } 406 407 if (-1 == nodeIndex) { 408 // No node at this point to accept the word. Create one. 409 final int insertionIndex = findInsertionIndex(currentNode, word[charIndex]); 410 final CharGroup newGroup = new CharGroup( 411 Arrays.copyOfRange(word, charIndex, word.length), 412 shortcutTargets, null /* bigrams */, frequency, isNotAWord, isBlacklistEntry); 413 currentNode.mData.add(insertionIndex, newGroup); 414 if (DBG) checkStack(currentNode); 415 } else { 416 // There is a word with a common prefix. 417 if (differentCharIndex == currentGroup.mChars.length) { 418 if (charIndex + differentCharIndex >= word.length) { 419 // The new word is a prefix of an existing word, but the node on which it 420 // should end already exists as is. Since the old CharNode was not a terminal, 421 // make it one by filling in its frequency and other attributes 422 currentGroup.update(frequency, shortcutTargets, null, isNotAWord, 423 isBlacklistEntry); 424 } else { 425 // The new word matches the full old word and extends past it. 426 // We only have to create a new node and add it to the end of this. 427 final CharGroup newNode = new CharGroup( 428 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 429 shortcutTargets, null /* bigrams */, frequency, isNotAWord, 430 isBlacklistEntry); 431 currentGroup.mChildren = new Node(); 432 currentGroup.mChildren.mData.add(newNode); 433 } 434 } else { 435 if (0 == differentCharIndex) { 436 // Exact same word. Update the frequency if higher. This will also add the 437 // new shortcuts to the existing shortcut list if it already exists. 438 currentGroup.update(frequency, shortcutTargets, null, 439 currentGroup.mIsNotAWord && isNotAWord, 440 currentGroup.mIsBlacklistEntry || isBlacklistEntry); 441 } else { 442 // Partial prefix match only. We have to replace the current node with a node 443 // containing the current prefix and create two new ones for the tails. 444 Node newChildren = new Node(); 445 final CharGroup newOldWord = new CharGroup( 446 Arrays.copyOfRange(currentGroup.mChars, differentCharIndex, 447 currentGroup.mChars.length), currentGroup.mShortcutTargets, 448 currentGroup.mBigrams, currentGroup.mFrequency, 449 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry, 450 currentGroup.mChildren); 451 newChildren.mData.add(newOldWord); 452 453 final CharGroup newParent; 454 if (charIndex + differentCharIndex >= word.length) { 455 newParent = new CharGroup( 456 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 457 shortcutTargets, null /* bigrams */, frequency, 458 isNotAWord, isBlacklistEntry, newChildren); 459 } else { 460 newParent = new CharGroup( 461 Arrays.copyOfRange(currentGroup.mChars, 0, differentCharIndex), 462 null /* shortcutTargets */, null /* bigrams */, -1, 463 false /* isNotAWord */, false /* isBlacklistEntry */, newChildren); 464 final CharGroup newWord = new CharGroup(Arrays.copyOfRange(word, 465 charIndex + differentCharIndex, word.length), 466 shortcutTargets, null /* bigrams */, frequency, 467 isNotAWord, isBlacklistEntry); 468 final int addIndex = word[charIndex + differentCharIndex] 469 > currentGroup.mChars[differentCharIndex] ? 1 : 0; 470 newChildren.mData.add(addIndex, newWord); 471 } 472 currentNode.mData.set(nodeIndex, newParent); 473 } 474 if (DBG) checkStack(currentNode); 475 } 476 } 477 } 478 479 private static int ARRAYS_ARE_EQUAL = 0; 480 481 /** 482 * Custom comparison of two int arrays taken to contain character codes. 483 * 484 * This method compares the two arrays passed as an argument in a lexicographic way, 485 * with an offset in the dst string. 486 * This method does NOT test for the first character. It is taken to be equal. 487 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 488 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 489 * strings are equal. This works BECAUSE we don't look at the first character. 490 * 491 * @param src the left-hand side string of the comparison. 492 * @param dst the right-hand side string of the comparison. 493 * @param dstOffset the offset in the right-hand side string. 494 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 495 */ compareArrays(final int[] src, final int[] dst, int dstOffset)496 private static int compareArrays(final int[] src, final int[] dst, int dstOffset) { 497 // We do NOT test the first char, because we come from a method that already 498 // tested it. 499 for (int i = 1; i < src.length; ++i) { 500 if (dstOffset + i >= dst.length) return i; 501 if (src[i] != dst[dstOffset + i]) return i; 502 } 503 if (dst.length > src.length) return src.length; 504 return ARRAYS_ARE_EQUAL; 505 } 506 507 /** 508 * Helper class that compares and sorts two chargroups according to their 509 * first element only. I repeat: ONLY the first element is considered, the rest 510 * is ignored. 511 * This comparator imposes orderings that are inconsistent with equals. 512 */ 513 static private final class CharGroupComparator implements java.util.Comparator<CharGroup> { 514 @Override compare(CharGroup c1, CharGroup c2)515 public int compare(CharGroup c1, CharGroup c2) { 516 if (c1.mChars[0] == c2.mChars[0]) return 0; 517 return c1.mChars[0] < c2.mChars[0] ? -1 : 1; 518 } 519 } 520 final static private CharGroupComparator CHARGROUP_COMPARATOR = new CharGroupComparator(); 521 522 /** 523 * Finds the insertion index of a character within a node. 524 */ findInsertionIndex(final Node node, int character)525 private static int findInsertionIndex(final Node node, int character) { 526 final ArrayList<CharGroup> data = node.mData; 527 final CharGroup reference = new CharGroup(new int[] { character }, 528 null /* shortcutTargets */, null /* bigrams */, 0, false /* isNotAWord */, 529 false /* isBlacklistEntry */); 530 int result = Collections.binarySearch(data, reference, CHARGROUP_COMPARATOR); 531 return result >= 0 ? result : -result - 1; 532 } 533 534 private static int CHARACTER_NOT_FOUND = -1; 535 536 /** 537 * Find the index of a char in a node, if it exists. 538 * 539 * @param node the node to search in. 540 * @param character the character to search for. 541 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND = -1 else. 542 */ findIndexOfChar(final Node node, int character)543 private static int findIndexOfChar(final Node node, int character) { 544 final int insertionIndex = findInsertionIndex(node, character); 545 if (node.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND; 546 return character == node.mData.get(insertionIndex).mChars[0] ? insertionIndex 547 : CHARACTER_NOT_FOUND; 548 } 549 550 /** 551 * Helper method to find a word in a given branch. 552 */ findWordInTree(Node node, final String s)553 public static CharGroup findWordInTree(Node node, final String s) { 554 int index = 0; 555 final StringBuilder checker = DBG ? new StringBuilder() : null; 556 557 CharGroup currentGroup; 558 final int codePointCountInS = s.codePointCount(0, s.length()); 559 do { 560 int indexOfGroup = findIndexOfChar(node, s.codePointAt(index)); 561 if (CHARACTER_NOT_FOUND == indexOfGroup) return null; 562 currentGroup = node.mData.get(indexOfGroup); 563 564 if (s.length() - index < currentGroup.mChars.length) return null; 565 int newIndex = index; 566 while (newIndex < s.length() && newIndex - index < currentGroup.mChars.length) { 567 if (currentGroup.mChars[newIndex - index] != s.codePointAt(newIndex)) return null; 568 newIndex++; 569 } 570 index = newIndex; 571 572 if (DBG) checker.append(new String(currentGroup.mChars, 0, currentGroup.mChars.length)); 573 if (index < codePointCountInS) { 574 node = currentGroup.mChildren; 575 } 576 } while (null != node && index < codePointCountInS); 577 578 if (index < codePointCountInS) return null; 579 if (!currentGroup.isTerminal()) return null; 580 if (DBG && !s.equals(checker.toString())) return null; 581 return currentGroup; 582 } 583 584 /** 585 * Helper method to find out whether a word is in the dict or not. 586 */ hasWord(final String s)587 public boolean hasWord(final String s) { 588 if (null == s || "".equals(s)) { 589 throw new RuntimeException("Can't search for a null or empty string"); 590 } 591 return null != findWordInTree(mRoot, s); 592 } 593 594 /** 595 * Recursively count the number of character groups in a given branch of the trie. 596 * 597 * @param node the parent node. 598 * @return the number of char groups in all the branch under this node. 599 */ countCharGroups(final Node node)600 public static int countCharGroups(final Node node) { 601 final int nodeSize = node.mData.size(); 602 int size = nodeSize; 603 for (int i = nodeSize - 1; i >= 0; --i) { 604 CharGroup group = node.mData.get(i); 605 if (null != group.mChildren) 606 size += countCharGroups(group.mChildren); 607 } 608 return size; 609 } 610 611 /** 612 * Recursively count the number of nodes in a given branch of the trie. 613 * 614 * @param node the node to count. 615 * @return the number of nodes in this branch. 616 */ countNodes(final Node node)617 public static int countNodes(final Node node) { 618 int size = 1; 619 for (int i = node.mData.size() - 1; i >= 0; --i) { 620 CharGroup group = node.mData.get(i); 621 if (null != group.mChildren) 622 size += countNodes(group.mChildren); 623 } 624 return size; 625 } 626 627 // Recursively find out whether there are any bigrams. 628 // This can be pretty expensive especially if there aren't any (we return as soon 629 // as we find one, so it's much cheaper if there are bigrams) hasBigramsInternal(final Node node)630 private static boolean hasBigramsInternal(final Node node) { 631 if (null == node) return false; 632 for (int i = node.mData.size() - 1; i >= 0; --i) { 633 CharGroup group = node.mData.get(i); 634 if (null != group.mBigrams) return true; 635 if (hasBigramsInternal(group.mChildren)) return true; 636 } 637 return false; 638 } 639 640 /** 641 * Finds out whether there are any bigrams in this dictionary. 642 * 643 * @return true if there is any bigram, false otherwise. 644 */ 645 // TODO: this is expensive especially for large dictionaries without any bigram. 646 // The up side is, this is always accurate and correct and uses no memory. We should 647 // find a more efficient way of doing this, without compromising too much on memory 648 // and ease of use. hasBigrams()649 public boolean hasBigrams() { 650 return hasBigramsInternal(mRoot); 651 } 652 653 // Historically, the tails of the words were going to be merged to save space. 654 // However, that would prevent the code to search for a specific address in log(n) 655 // time so this was abandoned. 656 // The code is still of interest as it does add some compression to any dictionary 657 // that has no need for attributes. Implementations that does not read attributes should be 658 // able to read a dictionary with merged tails. 659 // Also, the following code does support frequencies, as in, it will only merges 660 // tails that share the same frequency. Though it would result in the above loss of 661 // performance while searching by address, it is still technically possible to merge 662 // tails that contain attributes, but this code does not take that into account - it does 663 // not compare attributes and will merge terminals with different attributes regardless. mergeTails()664 public void mergeTails() { 665 MakedictLog.i("Do not merge tails"); 666 return; 667 668 // MakedictLog.i("Merging nodes. Number of nodes : " + countNodes(root)); 669 // MakedictLog.i("Number of groups : " + countCharGroups(root)); 670 // 671 // final HashMap<String, ArrayList<Node>> repository = 672 // new HashMap<String, ArrayList<Node>>(); 673 // mergeTailsInner(repository, root); 674 // 675 // MakedictLog.i("Number of different pseudohashes : " + repository.size()); 676 // int size = 0; 677 // for (ArrayList<Node> a : repository.values()) { 678 // size += a.size(); 679 // } 680 // MakedictLog.i("Number of nodes after merge : " + (1 + size)); 681 // MakedictLog.i("Recursively seen nodes : " + countNodes(root)); 682 } 683 684 // The following methods are used by the deactivated mergeTails() 685 // private static boolean isEqual(Node a, Node b) { 686 // if (null == a && null == b) return true; 687 // if (null == a || null == b) return false; 688 // if (a.data.size() != b.data.size()) return false; 689 // final int size = a.data.size(); 690 // for (int i = size - 1; i >= 0; --i) { 691 // CharGroup aGroup = a.data.get(i); 692 // CharGroup bGroup = b.data.get(i); 693 // if (aGroup.frequency != bGroup.frequency) return false; 694 // if (aGroup.alternates == null && bGroup.alternates != null) return false; 695 // if (aGroup.alternates != null && !aGroup.equals(bGroup.alternates)) return false; 696 // if (!Arrays.equals(aGroup.chars, bGroup.chars)) return false; 697 // if (!isEqual(aGroup.children, bGroup.children)) return false; 698 // } 699 // return true; 700 // } 701 702 // static private HashMap<String, ArrayList<Node>> mergeTailsInner( 703 // final HashMap<String, ArrayList<Node>> map, final Node node) { 704 // final ArrayList<CharGroup> branches = node.data; 705 // final int nodeSize = branches.size(); 706 // for (int i = 0; i < nodeSize; ++i) { 707 // CharGroup group = branches.get(i); 708 // if (null != group.children) { 709 // String pseudoHash = getPseudoHash(group.children); 710 // ArrayList<Node> similarList = map.get(pseudoHash); 711 // if (null == similarList) { 712 // similarList = new ArrayList<Node>(); 713 // map.put(pseudoHash, similarList); 714 // } 715 // boolean merged = false; 716 // for (Node similar : similarList) { 717 // if (isEqual(group.children, similar)) { 718 // group.children = similar; 719 // merged = true; 720 // break; 721 // } 722 // } 723 // if (!merged) { 724 // similarList.add(group.children); 725 // } 726 // mergeTailsInner(map, group.children); 727 // } 728 // } 729 // return map; 730 // } 731 732 // private static String getPseudoHash(final Node node) { 733 // StringBuilder s = new StringBuilder(); 734 // for (CharGroup g : node.data) { 735 // s.append(g.frequency); 736 // for (int ch : g.chars) { 737 // s.append(Character.toChars(ch)); 738 // } 739 // } 740 // return s.toString(); 741 // } 742 743 /** 744 * Iterator to walk through a dictionary. 745 * 746 * This is purely for convenience. 747 */ 748 public static final class DictionaryIterator implements Iterator<Word> { 749 private static final class Position { 750 public Iterator<CharGroup> pos; 751 public int length; Position(ArrayList<CharGroup> groups)752 public Position(ArrayList<CharGroup> groups) { 753 pos = groups.iterator(); 754 length = 0; 755 } 756 } 757 final StringBuilder mCurrentString; 758 final LinkedList<Position> mPositions; 759 DictionaryIterator(ArrayList<CharGroup> root)760 public DictionaryIterator(ArrayList<CharGroup> root) { 761 mCurrentString = new StringBuilder(); 762 mPositions = new LinkedList<Position>(); 763 final Position rootPos = new Position(root); 764 mPositions.add(rootPos); 765 } 766 767 @Override hasNext()768 public boolean hasNext() { 769 for (Position p : mPositions) { 770 if (p.pos.hasNext()) { 771 return true; 772 } 773 } 774 return false; 775 } 776 777 @Override next()778 public Word next() { 779 Position currentPos = mPositions.getLast(); 780 mCurrentString.setLength(mCurrentString.length() - currentPos.length); 781 782 do { 783 if (currentPos.pos.hasNext()) { 784 final CharGroup currentGroup = currentPos.pos.next(); 785 currentPos.length = currentGroup.mChars.length; 786 for (int i : currentGroup.mChars) 787 mCurrentString.append(Character.toChars(i)); 788 if (null != currentGroup.mChildren) { 789 currentPos = new Position(currentGroup.mChildren.mData); 790 mPositions.addLast(currentPos); 791 } 792 if (currentGroup.mFrequency >= 0) 793 return new Word(mCurrentString.toString(), currentGroup.mFrequency, 794 currentGroup.mShortcutTargets, currentGroup.mBigrams, 795 currentGroup.mIsNotAWord, currentGroup.mIsBlacklistEntry); 796 } else { 797 mPositions.removeLast(); 798 currentPos = mPositions.getLast(); 799 mCurrentString.setLength(mCurrentString.length() - mPositions.getLast().length); 800 } 801 } while (true); 802 } 803 804 @Override remove()805 public void remove() { 806 throw new UnsupportedOperationException("Unsupported yet"); 807 } 808 809 } 810 811 /** 812 * Method to return an iterator. 813 * 814 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 815 * and say : for (Word w : x) {} 816 */ 817 @Override iterator()818 public Iterator<Word> iterator() { 819 return new DictionaryIterator(mRoot.mData); 820 } 821 } 822