1 /* 2 ********************************************************************** 3 * Copyright (c) 2006-2007, Google and others. All Rights Reserved. 4 ********************************************************************** 5 * Author: Mark Davis 6 ********************************************************************** 7 */ 8 package org.unicode.cldr.util; 9 10 import java.io.IOException; 11 import java.util.Arrays; 12 import java.util.Comparator; 13 import java.util.Iterator; 14 import java.util.Map; 15 import java.util.Map.Entry; 16 17 import org.unicode.cldr.util.CharUtilities.CharSourceWrapper; 18 import org.unicode.cldr.util.Dictionary.Matcher.Filter; 19 import org.unicode.cldr.util.Dictionary.Matcher.Status; 20 21 import com.ibm.icu.util.ICUUncheckedIOException; 22 23 /** 24 * Provides for detecting all words starting at a given offset, and returning a 25 * value associated with each of the words. Logically, it is backed by a 26 * Map<String,Object> You set the offset you are concerned about, then 27 * call next() until it doesn't return MATCH. Along the way, you will get 28 * results. For example, here is some sample code and results. 29 * 30 * <pre> 31 * System.out.println("Using dictionary: " + dictionary.getMapping()); 32 * System.out.println("Searching in: {" + sampleText + "}"); 33 * // Dictionaries are immutable, so we create a Matcher to search/test text. 34 * Matcher matcher = dictionary.getMatcher(); 35 * matcher.setText(sampleText); 36 * while (true) { 37 * Status status = matcher.find(); 38 * String unique = ""; // only set if needed 39 * if (status == Status.NONE) { 40 * break; 41 * } else if (status == Status.PARTIAL) { 42 * // sets the match value to the "first" partial match 43 * if (matcher.nextUniquePartial()) { 44 * unique = "\tUnique"; 45 * } else { 46 * unique = "\tNot Unique"; 47 * } 48 * } 49 * // Show results 50 * System.out.println("{" + sampleText.substring(0, matcher.getOffset()) + "[[[" 51 * + sampleText.substring(matcher.getOffset(), matcher.getMatchEnd()) 52 * + "]]]" + sampleText.substring(matcher.getMatchEnd()) + "}\t" + status 53 * + " \t{" + matcher.getMatchValue() + "}\t" + unique); 54 * } 55 * </pre> 56 * 57 * Output: 58 * 59 * <pre> 60 * Using dictionary: {any=All, man=Woman, many=Few} 61 * Searching in: {many manners ma} 62 * {[[[man]]]y manners ma} MATCH {Woman} 63 * {[[[many]]] manners ma} MATCH {Few} 64 * {m[[[any]]] manners ma} MATCH {All} 65 * {many [[[man]]]ners ma} MATCH {Woman} 66 * {many [[[man]]]ners ma} PARTIAL {Few} Unique 67 * {many m[[[an]]]ners ma} PARTIAL {All} Unique 68 * {many manners [[[ma]]]} PARTIAL {Woman} Not Unique 69 * </pre> 70 * 71 * When you get a PARTIAL status, the match value is undefined. Often people 72 * will just treat PARTIAL matches as if they were NONE. However, sometimes 73 * people may be interested in finding out whether the text in question is the 74 * truncation or abbreviation of a possible table value. In that case, you can 75 * test further at that point, as is done above. For example, suppose that the 76 * dictionary contains a mapping from English month names to their numeric 77 * values. 78 * <ol> 79 * <li>When we are parsing "Jul 1 1990", we will find a unique partial match at "Jul", with the value 7, for July, and 80 * use it.</li> 81 * <li>When we are parsing "Ju 1 1990", on the other hand, we will find a non-unique partial match at "Ju". While a 82 * value is returned, it is only for one of the possible words ("June" and "July") so (for this application) we can 83 * decide that the parse fails since the month isn't sufficiently distinguished.</li> 84 * </ol> 85 * 86 * @author markdavis 87 * 88 */ 89 public abstract class Dictionary<T> { 90 91 /** 92 * Get the strings from the dictionary. The Map is either read-only or a copy; 93 * that is, modifications do not affect the builder. 94 * 95 * @return 96 */ getMapping()97 public abstract Iterator<Entry<CharSequence, T>> getMapping(); 98 99 /** 100 * Interface for building a new simple StateDictionary. The Map must be sorted 101 * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "". 102 * 103 * @param old 104 * @return 105 */ 106 public interface DictionaryBuilder<T> { make(Map<CharSequence, T> source)107 public Dictionary<T> make(Map<CharSequence, T> source); 108 } 109 110 /** 111 * Return more comprehensive debug info if available. 112 * 113 * @return 114 */ debugShow()115 public String debugShow() { 116 return toString(); 117 } 118 getMatcher()119 public abstract Matcher<T> getMatcher(); 120 121 public abstract static class Matcher<T> { 122 protected CharSource text; 123 124 protected int offset; 125 126 protected int matchEnd; 127 128 protected T matchValue; 129 130 // /* 131 // * A Dictionary may also have a builder, that allows the dictionary to be 132 // * built at runtime. 133 // */ 134 // public interface Builder<T> { 135 // /** 136 // * Add strings to the dictionary. It is an error to add the null string, or 137 // * to add a string that is less than a previously added strings. That is, 138 // * the strings must be added in ascending codepoint order. 139 // * 140 // * @param text 141 // * @param result 142 // * @return 143 // */ 144 // public Dictionary<T> getInstance(Map<CharSequence,T> source); 145 // } 146 hasCharAt(int index)147 public boolean hasCharAt(int index) { 148 return text.hasCharAt(index); 149 } 150 151 /** 152 * Set the target text to match within; also resets the offset to zero, calling setOffset(). 153 * 154 * @param text 155 * @return 156 */ setText(CharSource text)157 public Matcher<T> setText(CharSource text) { 158 this.text = text; 159 return setOffset(0); 160 } 161 162 /** 163 * Set the target text to match within; also resets the offset to zero, calling setOffset(). 164 * 165 * @param text 166 * @return 167 */ setText(CharSequence text)168 public Matcher<T> setText(CharSequence text) { 169 this.text = new CharUtilities.CharSourceWrapper<>(text); 170 return setOffset(0); 171 } 172 173 /** 174 * Retrieve the target text to match within. 175 * 176 * @return 177 */ getText()178 public CharSource getText() { 179 return text; 180 } 181 182 /** 183 * Set the position in the target text to match from. Matches only go forwards 184 * in the string. 185 * 186 * @param offset 187 * @return 188 */ setOffset(int offset)189 public Matcher<T> setOffset(int offset) { 190 this.offset = offset; 191 matchEnd = offset; 192 return this; 193 } 194 195 /** 196 * Get the offset from which we are matching. 197 * 198 * @return 199 */ getOffset()200 public int getOffset() { 201 return offset; 202 } 203 204 /** 205 * Get the latest match value after calling next(); see next() for more information. 206 * 207 * @return 208 */ getMatchValue()209 public T getMatchValue() { 210 return matchValue; 211 } 212 213 /** 214 * Get the latest match end (that is, how far we matched); see next() for more information. 215 * 216 * @return 217 */ getMatchEnd()218 public int getMatchEnd() { 219 return matchEnd; 220 } 221 222 /** 223 * Get the text that we are matching in.. 224 * 225 * @return 226 */ getMatchText()227 public CharSource getMatchText() { 228 return text.sublist(offset, matchEnd); 229 } 230 231 /** 232 * The status of a match; see next() for more information. 233 */ 234 public enum Status { 235 /** 236 * There are no further matches at all. 237 */ 238 NONE, 239 /** 240 * There is a partial match for a single item. Example: dictionary contains 241 * "man", text has "max". There will be a partial match after "ma". 242 */ 243 PARTIAL, 244 /** 245 * There is a full match 246 */ 247 MATCH, 248 } 249 250 /** 251 * Finds the next match, and sets the matchValue and matchEnd. Normally you 252 * call in a loop until you get NONE.<br> 253 * <b>Warning: the results of calling next() after getting NONE are 254 * undefined!</b><br> 255 * Here is what happens with the different return values: 256 * <ul> 257 * <li>MATCH: there was an exact match. Its matchValue and matchEnd are set.</li> 258 * <li>PARTIAL: there was a partial match. The matchEnd is set to the furthest point that could be reached 259 * successfully. To get the matchValue, and whether or not there were multiple partial matches, call 260 * nextPartial().</li> 261 * <li>NONE: the matchValue and matchEnd are undefined.</li> 262 * </ul> 263 * 264 * @return MATCH if there is a match. 265 */ next()266 public abstract Status next(); 267 268 public enum Filter { 269 /** 270 * Returns all values: 0 or more MATCH or PARTIAL values (with NONE at end) 271 */ 272 ALL, 273 /** 274 * Only 0 or more MATCH values (with NONE at end) 275 */ 276 MATCHES, 277 /** 278 * Only one value: the longest MATCH value 279 */ 280 LONGEST_MATCH, 281 /** 282 * Only one value: the longest MATCH or PARTIAL 283 */ 284 LONGEST, 285 /** 286 * Only one value: the longest MATCH or unique PARTIAL 287 */ 288 LONGEST_UNIQUE, 289 /** 290 * Only one value: the longest MATCH or PARTIAL (but only the Partial if it is at the end). 291 */ 292 LONGEST_WITH_FINAL_PARTIAL 293 } 294 295 /** 296 * Finds the next match, and sets the matchValue and matchEnd. Normally you 297 * call in a loop until you get NONE.<br> 298 * <b>Warning: the results of calling next() after getting NONE are 299 * undefined!</b><br> 300 * Here is what happens with the different return values: 301 * <ul> 302 * <li>MATCH: there was an exact match. Its matchValue and matchEnd are set.</li> 303 * <li>PARTIAL: there was a partial match. The matchEnd is set to the furthest point that could be reached 304 * successfully. To get the matchValue, and whether or not there were multiple partial matches, call 305 * nextPartial().</li> 306 * <li>NONE: the matchValue and matchEnd are undefined.</li> 307 * </ul> 308 * Question: should we make the name more explicit, like nextAtOffset? Because 309 * this iterates through the matches <i>at</i> an offset, not through offsets. 310 * 311 * @return MATCH if there is a match. 312 */ next(Filter filter)313 public Status next(Filter filter) { 314 if (filter == Filter.ALL) { 315 return next(); 316 } 317 Status lastStatus = Status.NONE; 318 T lastValue = null; 319 int lastEnd = -1; 320 while (true) { 321 Status status = next(); 322 if (status == Status.NONE) { 323 if (lastValue == null) { 324 return status; 325 } 326 matchEnd = lastEnd; 327 matchValue = lastValue; 328 return lastStatus; 329 } else if (status == Status.MATCH 330 || (status == Status.PARTIAL 331 && (filter == Filter.LONGEST 332 || filter == Filter.LONGEST_UNIQUE && nextUniquePartial() 333 || filter == Filter.LONGEST_WITH_FINAL_PARTIAL && !text.hasCharAt(matchEnd)))) { 334 if (filter == Filter.MATCHES) { 335 return status; 336 } 337 lastStatus = status; 338 lastValue = getMatchValue(); 339 lastEnd = matchEnd; 340 } 341 } 342 } 343 344 /** 345 * Determine whether a partial match is singular (there is only one possible 346 * continuation) or multiple (there are different continuations). 347 * <ul> 348 * <li>If so, sets the value of matchValue to that of the string that could have been returned if appropriate 349 * additional characters were inserted at matchEnd.</li> 350 * <li>If not, then the matchValue is indeterminate.</li> 351 * </ul> 352 * <p> 353 * This must only be called immediatedly following a PARTIAL result from next(). 354 * <p> 355 * QUESTION: would it be useful to be able to get all the partial matches?? 356 * 357 * @return true if the partial match is singular, false if it is plural. 358 */ nextUniquePartial()359 public abstract boolean nextUniquePartial(); 360 361 /** 362 * Return the value for a given piece of text, or Integer.MIN_VALUE if there is none. May be overridden for 363 * efficiency. 364 */ get(CharSource text)365 public T get(CharSource text) { 366 setText(text); // set the text to operate on 367 while (true) { 368 Status next1 = next(); 369 if (next1 != Status.MATCH) { 370 return null; 371 } else if (!text.hasCharAt(getMatchEnd())) { 372 return getMatchValue(); 373 } 374 } 375 } 376 377 /** 378 * Advances the offset until next() doesn't return NONE. 379 */ find(Filter filter)380 public Status find(Filter filter) { 381 while (true) { 382 Status status = next(filter); 383 if (status != Status.NONE) { 384 return status; 385 } else if (!text.hasCharAt(getMatchEnd())) { 386 return status; 387 } else { 388 nextOffset(); 389 } 390 } 391 } 392 393 /** 394 * Increment the offset in the text. 395 */ nextOffset()396 public Matcher nextOffset() { 397 return setOffset(++offset); 398 } 399 400 /** 401 * Convert the remaining text (after offset) to the target. Any substring with a MATCH is replaced by the 402 * value.toString(); other characters are copied. Converts to first (shortest) match.. 403 * TODO add parameter to pick between shortest and longest. 404 * 405 * @param target 406 */ convert(Appendable target)407 public Appendable convert(Appendable target) { 408 try { 409 while (text.hasCharAt(offset)) { 410 Status status = next(); 411 if (status != Status.MATCH) { 412 target.append(text.charAt(getOffset())); 413 nextOffset(); 414 } else { 415 target.append(getMatchValue().toString()); 416 setOffset(getMatchEnd()); 417 } 418 } 419 return target; 420 } catch (IOException e) { 421 throw new ICUUncheckedIOException("Internal error", e); 422 } 423 } 424 425 /** 426 * For debugging 427 */ 428 @Override toString()429 public String toString() { 430 return "{offset: " + offset + ", end: " + matchEnd + ", value: " + matchValue + ", text: \"" 431 + text.subSequence(0, text.getKnownLength()) 432 + (text.hasCharAt(text.getKnownLength()) ? "..." : "") + "\"}"; 433 } 434 435 /** 436 * Get the dictionary associated with this matcher. 437 */ getDictionary()438 public abstract Dictionary<T> getDictionary(); 439 440 /** 441 * The offset is not yet at the end. 442 * 443 * @return 444 */ hasMore()445 public boolean hasMore() { 446 return text.hasCharAt(offset); 447 } 448 } 449 450 /** 451 * Return the code point order of two CharSequences. Really ought to be a method on CharSequence. 452 * If the text has isolated surrogates, they will not sort correctly. 453 */ 454 public static final Comparator<CharSequence> CHAR_SEQUENCE_COMPARATOR = new Comparator<CharSequence>() { 455 @Override 456 public int compare(CharSequence o1, CharSequence o2) { 457 return CharUtilities.compare(o1, o2); 458 } 459 }; 460 461 public static class DictionaryCharList<T extends CharSequence> extends CharSourceWrapper<T> { 462 protected boolean failOnLength = false; 463 protected StringBuilder buffer = new StringBuilder(); 464 protected int[] sourceOffsets; 465 protected Matcher<T> matcher; 466 protected boolean atEnd; 467 DictionaryCharList(Dictionary<T> dictionary, T source)468 public DictionaryCharList(Dictionary<T> dictionary, T source) { 469 super(source); 470 matcher = dictionary.getMatcher().setText(source); 471 atEnd = source.length() == 0; 472 sourceOffsets = new int[source.length()]; 473 } 474 475 @Override hasCharAt(int index)476 public boolean hasCharAt(int index) { 477 if (index >= buffer.length()) { 478 if (atEnd) { 479 return false; 480 } 481 growToOffset(index + 1); 482 return index < buffer.length(); 483 } 484 return true; 485 } 486 487 @Override charAt(int index)488 public char charAt(int index) { 489 if (!atEnd && index >= buffer.length()) { 490 growToOffset(index + 1); 491 } 492 return buffer.charAt(index); 493 } 494 495 // sourceOffsets contains valid entries up to buffer.length() + 1. growToOffset(int offset)496 private void growToOffset(int offset) { 497 int length = buffer.length(); 498 while (length < offset && !atEnd) { 499 Status status = matcher.next(Filter.LONGEST_MATCH); 500 int currentOffset = matcher.getOffset(); 501 final int matchEnd = matcher.getMatchEnd(); 502 if (status == Status.MATCH) { 503 final T replacement = matcher.getMatchValue(); 504 setOffsets(length + 1, replacement.length(), matchEnd); 505 buffer.append(replacement); 506 length = buffer.length(); 507 matcher.setOffset(matchEnd); 508 } else { 509 setOffsets(length + 1, 1, currentOffset + 1); 510 buffer.append(source.charAt(currentOffset)); 511 length = buffer.length(); 512 matcher.nextOffset(); 513 } 514 atEnd = matcher.getOffset() >= source.length(); 515 } 516 } 517 setOffsets(final int start, final int count, final int value)518 private void setOffsets(final int start, final int count, final int value) { 519 final int length = start + count; 520 if (sourceOffsets.length < length) { 521 int newCapacity = sourceOffsets.length * 2 + 1; 522 if (newCapacity < length + 50) { 523 newCapacity = length + 50; 524 } 525 int[] temp = new int[newCapacity]; 526 System.arraycopy(sourceOffsets, 0, temp, 0, sourceOffsets.length); 527 sourceOffsets = temp; 528 } 529 for (int i = start; i < length; ++i) { 530 sourceOffsets[i] = value; 531 } 532 } 533 534 @Override fromSourceOffset(int offset)535 public int fromSourceOffset(int offset) { 536 // TODO fix to do real binary search; returning a determinate value. 537 return Arrays.binarySearch(sourceOffsets, offset); 538 } 539 540 @Override toSourceOffset(int offset)541 public int toSourceOffset(int offset) { 542 if (offset > buffer.length()) { 543 growToOffset(offset); 544 if (offset > buffer.length()) { 545 throw new ArrayIndexOutOfBoundsException(offset); 546 } 547 } 548 return sourceOffsets[offset]; 549 } 550 551 @Override subSequence(int start, int end)552 public CharSequence subSequence(int start, int end) { 553 if (!atEnd && end > buffer.length()) { 554 growToOffset(end); 555 } 556 return buffer.subSequence(start, end); 557 } 558 559 @Override sourceSubSequence(int start, int end)560 public CharSequence sourceSubSequence(int start, int end) { 561 // TODO Auto-generated method stub 562 return source.subSequence(toSourceOffset(start), toSourceOffset(end)); 563 } 564 565 @Override getKnownLength()566 public int getKnownLength() { 567 return buffer.length(); 568 } 569 } 570 load(Iterator<Entry<A, B>> input, Map<A, B> output)571 public static <A, B> Map<A, B> load(Iterator<Entry<A, B>> input, Map<A, B> output) { 572 while (input.hasNext()) { 573 Entry<A, B> entry = input.next(); 574 output.put(entry.getKey(), entry.getValue()); 575 } 576 return output; 577 } 578 }