1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.makedict; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 import com.android.inputmethod.latin.define.DecoderSpecificConstants; 21 import com.android.inputmethod.latin.makedict.FormatSpec.DictionaryOptions; 22 23 import java.util.ArrayList; 24 import java.util.Arrays; 25 import java.util.Collections; 26 import java.util.Iterator; 27 import java.util.LinkedList; 28 29 /** 30 * A dictionary that can fusion heads and tails of words for more compression. 31 */ 32 @UsedForTesting 33 public final class FusionDictionary implements Iterable<WordProperty> { 34 private static final boolean DBG = MakedictLog.DBG; 35 36 private static int CHARACTER_NOT_FOUND_INDEX = -1; 37 38 /** 39 * A node array of the dictionary, containing several PtNodes. 40 * 41 * A PtNodeArray is but an ordered array of PtNodes, which essentially contain all the 42 * real information. 43 * This class also contains fields to cache size and address, to help with binary 44 * generation. 45 */ 46 public static final class PtNodeArray { 47 ArrayList<PtNode> mData; 48 // To help with binary generation 49 int mCachedSize = Integer.MIN_VALUE; 50 // mCachedAddressBefore/AfterUpdate are helpers for binary dictionary generation. They 51 // always hold the same value except between dictionary address compression, during which 52 // the update process needs to know about both values at the same time. Updating will 53 // update the AfterUpdate value, and the code will move them to BeforeUpdate before 54 // the next update pass. 55 int mCachedAddressBeforeUpdate = Integer.MIN_VALUE; 56 int mCachedAddressAfterUpdate = Integer.MIN_VALUE; 57 int mCachedParentAddress = 0; 58 PtNodeArray()59 public PtNodeArray() { 60 mData = new ArrayList<>(); 61 } PtNodeArray(ArrayList<PtNode> data)62 public PtNodeArray(ArrayList<PtNode> data) { 63 Collections.sort(data, PTNODE_COMPARATOR); 64 mData = data; 65 } 66 } 67 68 /** 69 * PtNode is a group of characters, with probability information, shortcut targets, bigrams, 70 * and children (Pt means Patricia Trie). 71 * 72 * This is the central class of the in-memory representation. A PtNode is what can 73 * be seen as a traditional "trie node", except it can hold several characters at the 74 * same time. A PtNode essentially represents one or several characters in the middle 75 * of the trie tree; as such, it can be a terminal, and it can have children. 76 * In this in-memory representation, whether the PtNode is a terminal or not is represented 77 * by mProbabilityInfo. The PtNode is a terminal when the mProbabilityInfo is not null and the 78 * PtNode is not a terminal when the mProbabilityInfo is null. A terminal may have non-null 79 * shortcuts and/or bigrams, but a non-terminal may not. Moreover, children, if present, 80 * are non-null. 81 */ 82 public static final class PtNode { 83 private static final int NOT_A_TERMINAL = -1; 84 final int mChars[]; 85 ArrayList<WeightedString> mBigrams; 86 // null == mProbabilityInfo indicates this is not a terminal. 87 ProbabilityInfo mProbabilityInfo; 88 int mTerminalId; // NOT_A_TERMINAL == mTerminalId indicates this is not a terminal. 89 PtNodeArray mChildren; 90 boolean mIsNotAWord; // Only a shortcut 91 boolean mIsPossiblyOffensive; 92 // mCachedSize and mCachedAddressBefore/AfterUpdate are helpers for binary dictionary 93 // generation. Before and After always hold the same value except during dictionary 94 // address compression, where the update process needs to know about both values at the 95 // same time. Updating will update the AfterUpdate value, and the code will move them 96 // to BeforeUpdate before the next update pass. 97 // The update process does not need two versions of mCachedSize. 98 int mCachedSize; // The size, in bytes, of this PtNode. 99 int mCachedAddressBeforeUpdate; // The address of this PtNode (before update) 100 int mCachedAddressAfterUpdate; // The address of this PtNode (after update) 101 PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)102 public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, 103 final ProbabilityInfo probabilityInfo, final boolean isNotAWord, 104 final boolean isPossiblyOffensive) { 105 mChars = chars; 106 mProbabilityInfo = probabilityInfo; 107 mTerminalId = probabilityInfo == null ? NOT_A_TERMINAL : probabilityInfo.mProbability; 108 mBigrams = bigrams; 109 mChildren = null; 110 mIsNotAWord = isNotAWord; 111 mIsPossiblyOffensive = isPossiblyOffensive; 112 } 113 PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive, final PtNodeArray children)114 public PtNode(final int[] chars, final ArrayList<WeightedString> bigrams, 115 final ProbabilityInfo probabilityInfo, final boolean isNotAWord, 116 final boolean isPossiblyOffensive, final PtNodeArray children) { 117 mChars = chars; 118 mProbabilityInfo = probabilityInfo; 119 mBigrams = bigrams; 120 mChildren = children; 121 mIsNotAWord = isNotAWord; 122 mIsPossiblyOffensive = isPossiblyOffensive; 123 } 124 addChild(PtNode n)125 public void addChild(PtNode n) { 126 if (null == mChildren) { 127 mChildren = new PtNodeArray(); 128 } 129 mChildren.mData.add(n); 130 } 131 getTerminalId()132 public int getTerminalId() { 133 return mTerminalId; 134 } 135 isTerminal()136 public boolean isTerminal() { 137 return mProbabilityInfo != null; 138 } 139 getProbability()140 public int getProbability() { 141 return isTerminal() ? mProbabilityInfo.mProbability : NOT_A_TERMINAL; 142 } 143 getIsNotAWord()144 public boolean getIsNotAWord() { 145 return mIsNotAWord; 146 } 147 getIsPossiblyOffensive()148 public boolean getIsPossiblyOffensive() { 149 return mIsPossiblyOffensive; 150 } 151 getBigrams()152 public ArrayList<WeightedString> getBigrams() { 153 // We don't want write permission to escape outside the package, so we return a copy 154 if (null == mBigrams) return null; 155 final ArrayList<WeightedString> copyOfBigrams = new ArrayList<>(mBigrams); 156 return copyOfBigrams; 157 } 158 hasSeveralChars()159 public boolean hasSeveralChars() { 160 assert(mChars.length > 0); 161 return 1 < mChars.length; 162 } 163 164 /** 165 * Adds a word to the bigram list. Updates the probability information if the word already 166 * exists. 167 */ addBigram(final String word, final ProbabilityInfo probabilityInfo)168 public void addBigram(final String word, final ProbabilityInfo probabilityInfo) { 169 if (mBigrams == null) { 170 mBigrams = new ArrayList<>(); 171 } 172 WeightedString bigram = getBigram(word); 173 if (bigram != null) { 174 bigram.mProbabilityInfo = probabilityInfo; 175 } else { 176 bigram = new WeightedString(word, probabilityInfo); 177 mBigrams.add(bigram); 178 } 179 } 180 181 /** 182 * Gets the bigram for the given word. 183 * Returns null if the word is not in the bigrams list. 184 */ getBigram(final String word)185 public WeightedString getBigram(final String word) { 186 // TODO: Don't do a linear search 187 if (mBigrams != null) { 188 final int size = mBigrams.size(); 189 for (int i = 0; i < size; ++i) { 190 WeightedString bigram = mBigrams.get(i); 191 if (bigram.mWord.equals(word)) { 192 return bigram; 193 } 194 } 195 } 196 return null; 197 } 198 199 /** 200 * Updates the PtNode with the given properties. Adds the shortcut and bigram lists to 201 * the existing ones if any. Note: unigram, bigram, and shortcut frequencies are only 202 * updated if they are higher than the existing ones. 203 */ update(final ProbabilityInfo probabilityInfo, final ArrayList<WeightedString> bigrams, final boolean isNotAWord, final boolean isPossiblyOffensive)204 void update(final ProbabilityInfo probabilityInfo, 205 final ArrayList<WeightedString> bigrams, 206 final boolean isNotAWord, final boolean isPossiblyOffensive) { 207 mProbabilityInfo = ProbabilityInfo.max(mProbabilityInfo, probabilityInfo); 208 if (bigrams != null) { 209 if (mBigrams == null) { 210 mBigrams = bigrams; 211 } else { 212 final int size = bigrams.size(); 213 for (int i = 0; i < size; ++i) { 214 final WeightedString bigram = bigrams.get(i); 215 final WeightedString existingBigram = getBigram(bigram.mWord); 216 if (existingBigram == null) { 217 mBigrams.add(bigram); 218 } else { 219 existingBigram.mProbabilityInfo = ProbabilityInfo.max( 220 existingBigram.mProbabilityInfo, bigram.mProbabilityInfo); 221 } 222 } 223 } 224 } 225 mIsNotAWord = isNotAWord; 226 mIsPossiblyOffensive = isPossiblyOffensive; 227 } 228 } 229 230 public final DictionaryOptions mOptions; 231 public final PtNodeArray mRootNodeArray; 232 FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options)233 public FusionDictionary(final PtNodeArray rootNodeArray, final DictionaryOptions options) { 234 mRootNodeArray = rootNodeArray; 235 mOptions = options; 236 } 237 addOptionAttribute(final String key, final String value)238 public void addOptionAttribute(final String key, final String value) { 239 mOptions.mAttributes.put(key, value); 240 } 241 242 /** 243 * Helper method to convert a String to an int array. 244 */ getCodePoints(final String word)245 static int[] getCodePoints(final String word) { 246 // TODO: this is a copy-paste of the old contents of StringUtils.toCodePointArray, 247 // which is not visible from the makedict package. Factor this code. 248 final int length = word.length(); 249 if (length <= 0) return new int[] {}; 250 final char[] characters = word.toCharArray(); 251 final int[] codePoints = new int[Character.codePointCount(characters, 0, length)]; 252 int codePoint = Character.codePointAt(characters, 0); 253 int dsti = 0; 254 for (int srci = Character.charCount(codePoint); 255 srci < length; srci += Character.charCount(codePoint), ++dsti) { 256 codePoints[dsti] = codePoint; 257 codePoint = Character.codePointAt(characters, srci); 258 } 259 codePoints[dsti] = codePoint; 260 return codePoints; 261 } 262 263 /** 264 * Helper method to add a word as a string. 265 * 266 * This method adds a word to the dictionary with the given frequency. Optional 267 * lists of bigrams can be passed here. For each word inside, 268 * they will be added to the dictionary as necessary. 269 * @param word the word to add. 270 * @param probabilityInfo probability information of the word. 271 * @param isNotAWord true if this should not be considered a word (e.g. shortcut only) 272 * @param isPossiblyOffensive true if this word is possibly offensive 273 */ add(final String word, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)274 public void add(final String word, final ProbabilityInfo probabilityInfo, 275 final boolean isNotAWord, final boolean isPossiblyOffensive) { 276 add(getCodePoints(word), probabilityInfo, isNotAWord, isPossiblyOffensive); 277 } 278 279 /** 280 * Validity check for a PtNode array. 281 * 282 * This method checks that all PtNodes in a node array are ordered as expected. 283 * If they are, nothing happens. If they aren't, an exception is thrown. 284 */ checkStack(PtNodeArray ptNodeArray)285 private static void checkStack(PtNodeArray ptNodeArray) { 286 ArrayList<PtNode> stack = ptNodeArray.mData; 287 int lastValue = -1; 288 for (int i = 0; i < stack.size(); ++i) { 289 int currentValue = stack.get(i).mChars[0]; 290 if (currentValue <= lastValue) { 291 throw new RuntimeException("Invalid stack"); 292 } 293 lastValue = currentValue; 294 } 295 } 296 297 /** 298 * Helper method to add a new bigram to the dictionary. 299 * 300 * @param word0 the previous word of the context 301 * @param word1 the next word of the context 302 * @param probabilityInfo the bigram probability info 303 */ setBigram(final String word0, final String word1, final ProbabilityInfo probabilityInfo)304 public void setBigram(final String word0, final String word1, 305 final ProbabilityInfo probabilityInfo) { 306 PtNode ptNode0 = findWordInTree(mRootNodeArray, word0); 307 if (ptNode0 != null) { 308 final PtNode ptNode1 = findWordInTree(mRootNodeArray, word1); 309 if (ptNode1 == null) { 310 add(getCodePoints(word1), new ProbabilityInfo(0), false /* isNotAWord */, 311 false /* isPossiblyOffensive */); 312 // The PtNode for the first word may have moved by the above insertion, 313 // if word1 and word2 share a common stem that happens not to have been 314 // a cutting point until now. In this case, we need to refresh ptNode. 315 ptNode0 = findWordInTree(mRootNodeArray, word0); 316 } 317 ptNode0.addBigram(word1, probabilityInfo); 318 } else { 319 throw new RuntimeException("First word of bigram not found " + word0); 320 } 321 } 322 323 /** 324 * Add a word to this dictionary. 325 * 326 * The shortcuts, if any, have to be in the dictionary already. If they aren't, 327 * an exception is thrown. 328 * @param word the word, as an int array. 329 * @param probabilityInfo the probability information of the word. 330 * @param isNotAWord true if this is not a word for spellchecking purposes (shortcut only or so) 331 * @param isPossiblyOffensive true if this word is possibly offensive 332 */ add(final int[] word, final ProbabilityInfo probabilityInfo, final boolean isNotAWord, final boolean isPossiblyOffensive)333 private void add(final int[] word, final ProbabilityInfo probabilityInfo, 334 final boolean isNotAWord, final boolean isPossiblyOffensive) { 335 assert(probabilityInfo.mProbability <= FormatSpec.MAX_TERMINAL_FREQUENCY); 336 if (word.length >= DecoderSpecificConstants.DICTIONARY_MAX_WORD_LENGTH) { 337 MakedictLog.w("Ignoring a word that is too long: word.length = " + word.length); 338 return; 339 } 340 341 PtNodeArray currentNodeArray = mRootNodeArray; 342 int charIndex = 0; 343 344 PtNode currentPtNode = null; 345 int differentCharIndex = 0; // Set by the loop to the index of the char that differs 346 int nodeIndex = findIndexOfChar(mRootNodeArray, word[charIndex]); 347 while (CHARACTER_NOT_FOUND_INDEX != nodeIndex) { 348 currentPtNode = currentNodeArray.mData.get(nodeIndex); 349 differentCharIndex = compareCharArrays(currentPtNode.mChars, word, charIndex); 350 if (ARRAYS_ARE_EQUAL != differentCharIndex 351 && differentCharIndex < currentPtNode.mChars.length) break; 352 if (null == currentPtNode.mChildren) break; 353 charIndex += currentPtNode.mChars.length; 354 if (charIndex >= word.length) break; 355 currentNodeArray = currentPtNode.mChildren; 356 nodeIndex = findIndexOfChar(currentNodeArray, word[charIndex]); 357 } 358 359 if (CHARACTER_NOT_FOUND_INDEX == nodeIndex) { 360 // No node at this point to accept the word. Create one. 361 final int insertionIndex = findInsertionIndex(currentNodeArray, word[charIndex]); 362 final PtNode newPtNode = new PtNode(Arrays.copyOfRange(word, charIndex, word.length), 363 null /* bigrams */, probabilityInfo, isNotAWord, 364 isPossiblyOffensive); 365 currentNodeArray.mData.add(insertionIndex, newPtNode); 366 if (DBG) checkStack(currentNodeArray); 367 } else { 368 // There is a word with a common prefix. 369 if (differentCharIndex == currentPtNode.mChars.length) { 370 if (charIndex + differentCharIndex >= word.length) { 371 // The new word is a prefix of an existing word, but the node on which it 372 // should end already exists as is. Since the old PtNode was not a terminal, 373 // make it one by filling in its frequency and other attributes 374 currentPtNode.update(probabilityInfo, null, isNotAWord, 375 isPossiblyOffensive); 376 } else { 377 // The new word matches the full old word and extends past it. 378 // We only have to create a new node and add it to the end of this. 379 final PtNode newNode = new PtNode( 380 Arrays.copyOfRange(word, charIndex + differentCharIndex, word.length), 381 null /* bigrams */, probabilityInfo, 382 isNotAWord, isPossiblyOffensive); 383 currentPtNode.mChildren = new PtNodeArray(); 384 currentPtNode.mChildren.mData.add(newNode); 385 } 386 } else { 387 if (0 == differentCharIndex) { 388 // Exact same word. Update the frequency if higher. This will also add the 389 // new shortcuts to the existing shortcut list if it already exists. 390 currentPtNode.update(probabilityInfo, null, 391 currentPtNode.mIsNotAWord && isNotAWord, 392 currentPtNode.mIsPossiblyOffensive || isPossiblyOffensive); 393 } else { 394 // Partial prefix match only. We have to replace the current node with a node 395 // containing the current prefix and create two new ones for the tails. 396 PtNodeArray newChildren = new PtNodeArray(); 397 final PtNode newOldWord = new PtNode( 398 Arrays.copyOfRange(currentPtNode.mChars, differentCharIndex, 399 currentPtNode.mChars.length), 400 currentPtNode.mBigrams, currentPtNode.mProbabilityInfo, 401 currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive, 402 currentPtNode.mChildren); 403 newChildren.mData.add(newOldWord); 404 405 final PtNode newParent; 406 if (charIndex + differentCharIndex >= word.length) { 407 newParent = new PtNode( 408 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 409 null /* bigrams */, probabilityInfo, 410 isNotAWord, isPossiblyOffensive, newChildren); 411 } else { 412 newParent = new PtNode( 413 Arrays.copyOfRange(currentPtNode.mChars, 0, differentCharIndex), 414 null /* bigrams */, null /* probabilityInfo */, 415 false /* isNotAWord */, false /* isPossiblyOffensive */, 416 newChildren); 417 final PtNode newWord = new PtNode(Arrays.copyOfRange(word, 418 charIndex + differentCharIndex, word.length), 419 null /* bigrams */, probabilityInfo, 420 isNotAWord, isPossiblyOffensive); 421 final int addIndex = word[charIndex + differentCharIndex] 422 > currentPtNode.mChars[differentCharIndex] ? 1 : 0; 423 newChildren.mData.add(addIndex, newWord); 424 } 425 currentNodeArray.mData.set(nodeIndex, newParent); 426 } 427 if (DBG) checkStack(currentNodeArray); 428 } 429 } 430 } 431 432 private static int ARRAYS_ARE_EQUAL = 0; 433 434 /** 435 * Custom comparison of two int arrays taken to contain character codes. 436 * 437 * This method compares the two arrays passed as an argument in a lexicographic way, 438 * with an offset in the dst string. 439 * This method does NOT test for the first character. It is taken to be equal. 440 * I repeat: this method starts the comparison at 1 <> dstOffset + 1. 441 * The index where the strings differ is returned. ARRAYS_ARE_EQUAL = 0 is returned if the 442 * strings are equal. This works BECAUSE we don't look at the first character. 443 * 444 * @param src the left-hand side string of the comparison. 445 * @param dst the right-hand side string of the comparison. 446 * @param dstOffset the offset in the right-hand side string. 447 * @return the index at which the strings differ, or ARRAYS_ARE_EQUAL = 0 if they don't. 448 */ compareCharArrays(final int[] src, final int[] dst, int dstOffset)449 private static int compareCharArrays(final int[] src, final int[] dst, int dstOffset) { 450 // We do NOT test the first char, because we come from a method that already 451 // tested it. 452 for (int i = 1; i < src.length; ++i) { 453 if (dstOffset + i >= dst.length) return i; 454 if (src[i] != dst[dstOffset + i]) return i; 455 } 456 if (dst.length > src.length) return src.length; 457 return ARRAYS_ARE_EQUAL; 458 } 459 460 /** 461 * Helper class that compares and sorts two PtNodes according to their 462 * first element only. I repeat: ONLY the first element is considered, the rest 463 * is ignored. 464 * This comparator imposes orderings that are inconsistent with equals. 465 */ 466 static final class PtNodeComparator implements java.util.Comparator<PtNode> { 467 @Override compare(PtNode p1, PtNode p2)468 public int compare(PtNode p1, PtNode p2) { 469 if (p1.mChars[0] == p2.mChars[0]) return 0; 470 return p1.mChars[0] < p2.mChars[0] ? -1 : 1; 471 } 472 } 473 final static PtNodeComparator PTNODE_COMPARATOR = new PtNodeComparator(); 474 475 /** 476 * Finds the insertion index of a character within a node array. 477 */ findInsertionIndex(final PtNodeArray nodeArray, int character)478 private static int findInsertionIndex(final PtNodeArray nodeArray, int character) { 479 final ArrayList<PtNode> data = nodeArray.mData; 480 final PtNode reference = new PtNode(new int[] { character }, 481 null /* bigrams */, null /* probabilityInfo */, 482 false /* isNotAWord */, false /* isPossiblyOffensive */); 483 int result = Collections.binarySearch(data, reference, PTNODE_COMPARATOR); 484 return result >= 0 ? result : -result - 1; 485 } 486 487 /** 488 * Find the index of a char in a node array, if it exists. 489 * 490 * @param nodeArray the node array to search in. 491 * @param character the character to search for. 492 * @return the position of the character if it's there, or CHARACTER_NOT_FOUND_INDEX = -1 else. 493 */ findIndexOfChar(final PtNodeArray nodeArray, int character)494 private static int findIndexOfChar(final PtNodeArray nodeArray, int character) { 495 final int insertionIndex = findInsertionIndex(nodeArray, character); 496 if (nodeArray.mData.size() <= insertionIndex) return CHARACTER_NOT_FOUND_INDEX; 497 return character == nodeArray.mData.get(insertionIndex).mChars[0] ? insertionIndex 498 : CHARACTER_NOT_FOUND_INDEX; 499 } 500 501 /** 502 * Helper method to find a word in a given branch. 503 */ findWordInTree(final PtNodeArray rootNodeArray, final String string)504 public static PtNode findWordInTree(final PtNodeArray rootNodeArray, final String string) { 505 PtNodeArray nodeArray = rootNodeArray; 506 int index = 0; 507 final StringBuilder checker = DBG ? new StringBuilder() : null; 508 final int[] codePoints = getCodePoints(string); 509 510 PtNode currentPtNode; 511 do { 512 int indexOfGroup = findIndexOfChar(nodeArray, codePoints[index]); 513 if (CHARACTER_NOT_FOUND_INDEX == indexOfGroup) return null; 514 currentPtNode = nodeArray.mData.get(indexOfGroup); 515 516 if (codePoints.length - index < currentPtNode.mChars.length) return null; 517 int newIndex = index; 518 while (newIndex < codePoints.length && newIndex - index < currentPtNode.mChars.length) { 519 if (currentPtNode.mChars[newIndex - index] != codePoints[newIndex]) return null; 520 newIndex++; 521 } 522 index = newIndex; 523 524 if (DBG) { 525 checker.append(new String(currentPtNode.mChars, 0, currentPtNode.mChars.length)); 526 } 527 if (index < codePoints.length) { 528 nodeArray = currentPtNode.mChildren; 529 } 530 } while (null != nodeArray && index < codePoints.length); 531 532 if (index < codePoints.length) return null; 533 if (!currentPtNode.isTerminal()) return null; 534 if (DBG && !string.equals(checker.toString())) return null; 535 return currentPtNode; 536 } 537 538 /** 539 * Helper method to find out whether a word is in the dict or not. 540 */ hasWord(final String s)541 public boolean hasWord(final String s) { 542 if (null == s || "".equals(s)) { 543 throw new RuntimeException("Can't search for a null or empty string"); 544 } 545 return null != findWordInTree(mRootNodeArray, s); 546 } 547 548 /** 549 * Recursively count the number of PtNodes in a given branch of the trie. 550 * 551 * @param nodeArray the parent node. 552 * @return the number of PtNodes in all the branch under this node. 553 */ countPtNodes(final PtNodeArray nodeArray)554 public static int countPtNodes(final PtNodeArray nodeArray) { 555 final int nodeSize = nodeArray.mData.size(); 556 int size = nodeSize; 557 for (int i = nodeSize - 1; i >= 0; --i) { 558 PtNode ptNode = nodeArray.mData.get(i); 559 if (null != ptNode.mChildren) 560 size += countPtNodes(ptNode.mChildren); 561 } 562 return size; 563 } 564 565 /** 566 * Iterator to walk through a dictionary. 567 * 568 * This is purely for convenience. 569 */ 570 public static final class DictionaryIterator implements Iterator<WordProperty> { 571 private static final class Position { 572 public Iterator<PtNode> pos; 573 public int length; Position(ArrayList<PtNode> ptNodes)574 public Position(ArrayList<PtNode> ptNodes) { 575 pos = ptNodes.iterator(); 576 length = 0; 577 } 578 } 579 final StringBuilder mCurrentString; 580 final LinkedList<Position> mPositions; 581 DictionaryIterator(ArrayList<PtNode> ptRoot)582 public DictionaryIterator(ArrayList<PtNode> ptRoot) { 583 mCurrentString = new StringBuilder(); 584 mPositions = new LinkedList<>(); 585 final Position rootPos = new Position(ptRoot); 586 mPositions.add(rootPos); 587 } 588 589 @Override hasNext()590 public boolean hasNext() { 591 for (Position p : mPositions) { 592 if (p.pos.hasNext()) { 593 return true; 594 } 595 } 596 return false; 597 } 598 599 @Override next()600 public WordProperty next() { 601 Position currentPos = mPositions.getLast(); 602 mCurrentString.setLength(currentPos.length); 603 604 do { 605 if (currentPos.pos.hasNext()) { 606 final PtNode currentPtNode = currentPos.pos.next(); 607 currentPos.length = mCurrentString.length(); 608 for (int i : currentPtNode.mChars) { 609 mCurrentString.append(Character.toChars(i)); 610 } 611 if (null != currentPtNode.mChildren) { 612 currentPos = new Position(currentPtNode.mChildren.mData); 613 currentPos.length = mCurrentString.length(); 614 mPositions.addLast(currentPos); 615 } 616 if (currentPtNode.isTerminal()) { 617 return new WordProperty(mCurrentString.toString(), 618 currentPtNode.mProbabilityInfo, currentPtNode.mBigrams, 619 currentPtNode.mIsNotAWord, currentPtNode.mIsPossiblyOffensive); 620 } 621 } else { 622 mPositions.removeLast(); 623 currentPos = mPositions.getLast(); 624 mCurrentString.setLength(mPositions.getLast().length); 625 } 626 } while (true); 627 } 628 629 @Override remove()630 public void remove() { 631 throw new UnsupportedOperationException("Unsupported yet"); 632 } 633 634 } 635 636 /** 637 * Method to return an iterator. 638 * 639 * This method enables Java's enhanced for loop. With this you can have a FusionDictionary x 640 * and say : for (Word w : x) {} 641 */ 642 @Override iterator()643 public Iterator<WordProperty> iterator() { 644 return new DictionaryIterator(mRootNodeArray.mData); 645 } 646 } 647