• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  * Copyright (c) 2006-2007, Google and others.  All Rights Reserved.
4  **********************************************************************
5  * Author: Mark Davis
6  **********************************************************************
7  */
8 package org.unicode.cldr.util;
9 
10 import java.io.IOException;
11 import java.util.Arrays;
12 import java.util.Comparator;
13 import java.util.Iterator;
14 import java.util.Map;
15 import java.util.Map.Entry;
16 
17 import org.unicode.cldr.util.CharUtilities.CharSourceWrapper;
18 import org.unicode.cldr.util.Dictionary.Matcher.Filter;
19 import org.unicode.cldr.util.Dictionary.Matcher.Status;
20 
21 import com.ibm.icu.util.ICUUncheckedIOException;
22 
23 /**
24  * Provides for detecting all words starting at a given offset, and returning a
25  * value associated with each of the words. Logically, it is backed by a
26  * Map<String,Object> You set the offset you are concerned about, then
27  * call next() until it doesn't return MATCH. Along the way, you will get
28  * results. For example, here is some sample code and results.
29  *
30  * <pre>
31  * System.out.println(&quot;Using dictionary: &quot; + dictionary.getMapping());
32  * System.out.println(&quot;Searching in: {&quot; + sampleText + &quot;}&quot;);
33  * // Dictionaries are immutable, so we create a Matcher to search/test text.
34  * Matcher matcher = dictionary.getMatcher();
35  * matcher.setText(sampleText);
36  * while (true) {
37  *     Status status = matcher.find();
38  *     String unique = &quot;&quot;; // only set if needed
39  *     if (status == Status.NONE) {
40  *         break;
41  *     } else if (status == Status.PARTIAL) {
42  *         // sets the match value to the &quot;first&quot; partial match
43  *         if (matcher.nextUniquePartial()) {
44  *             unique = &quot;\tUnique&quot;;
45  *         } else {
46  *             unique = &quot;\tNot Unique&quot;;
47  *         }
48  *     }
49  *     // Show results
50  *     System.out.println(&quot;{&quot; + sampleText.substring(0, matcher.getOffset()) + &quot;[[[&quot;
51  *         + sampleText.substring(matcher.getOffset(), matcher.getMatchEnd())
52  *         + &quot;]]]&quot; + sampleText.substring(matcher.getMatchEnd()) + &quot;}\t&quot; + status
53  *         + &quot;  \t{&quot; + matcher.getMatchValue() + &quot;}\t&quot; + unique);
54  * }
55  * </pre>
56  *
57  * Output:
58  *
59  * <pre>
60  *   Using dictionary: {any=All, man=Woman, many=Few}
61  *   Searching in: {many manners ma}
62  *   {[[[man]]]y manners ma} MATCH   {Woman}
63  *   {[[[many]]] manners ma} MATCH   {Few}
64  *   {m[[[any]]] manners ma} MATCH   {All}
65  *   {many [[[man]]]ners ma} MATCH   {Woman}
66  *   {many [[[man]]]ners ma} PARTIAL   {Few}   Unique
67  *   {many m[[[an]]]ners ma} PARTIAL   {All}   Unique
68  *   {many manners [[[ma]]]} PARTIAL   {Woman}   Not Unique
69  * </pre>
70  *
71  * When you get a PARTIAL status, the match value is undefined. Often people
72  * will just treat PARTIAL matches as if they were NONE. However, sometimes
73  * people may be interested in finding out whether the text in question is the
74  * truncation or abbreviation of a possible table value. In that case, you can
75  * test further at that point, as is done above. For example, suppose that the
76  * dictionary contains a mapping from English month names to their numeric
77  * values.
78  * <ol>
79  * <li>When we are parsing "Jul 1 1990", we will find a unique partial match at "Jul", with the value 7, for July, and
80  * use it.</li>
81  * <li>When we are parsing "Ju 1 1990", on the other hand, we will find a non-unique partial match at "Ju". While a
82  * value is returned, it is only for one of the possible words ("June" and "July") so (for this application) we can
83  * decide that the parse fails since the month isn't sufficiently distinguished.</li>
84  * </ol>
85  *
86  * @author markdavis
87  *
88  */
89 public abstract class Dictionary<T> {
90 
91     /**
92      * Get the strings from the dictionary. The Map is either read-only or a copy;
93      * that is, modifications do not affect the builder.
94      *
95      * @return
96      */
getMapping()97     public abstract Iterator<Entry<CharSequence, T>> getMapping();
98 
99     /**
100      * Interface for building a new simple StateDictionary. The Map must be sorted
101      * according to Dictionary.CHAR_SEQUENCE_COMPARATOR. It must not contain the key "".
102      *
103      * @param old
104      * @return
105      */
106     public interface DictionaryBuilder<T> {
make(Map<CharSequence, T> source)107         public Dictionary<T> make(Map<CharSequence, T> source);
108     }
109 
110     /**
111      * Return more comprehensive debug info if available.
112      *
113      * @return
114      */
debugShow()115     public String debugShow() {
116         return toString();
117     }
118 
getMatcher()119     public abstract Matcher<T> getMatcher();
120 
121     public abstract static class Matcher<T> {
122         protected CharSource text;
123 
124         protected int offset;
125 
126         protected int matchEnd;
127 
128         protected T matchValue;
129 
130         // /*
131         // * A Dictionary may also have a builder, that allows the dictionary to be
132         // * built at runtime.
133         // */
134         // public interface Builder<T> {
135         // /**
136         // * Add strings to the dictionary. It is an error to add the null string, or
137         // * to add a string that is less than a previously added strings. That is,
138         // * the strings must be added in ascending codepoint order.
139         // *
140         // * @param text
141         // * @param result
142         // * @return
143         // */
144         // public Dictionary<T> getInstance(Map<CharSequence,T> source);
145         // }
146 
hasCharAt(int index)147         public boolean hasCharAt(int index) {
148             return text.hasCharAt(index);
149         }
150 
151         /**
152          * Set the target text to match within; also resets the offset to zero, calling setOffset().
153          *
154          * @param text
155          * @return
156          */
setText(CharSource text)157         public Matcher<T> setText(CharSource text) {
158             this.text = text;
159             return setOffset(0);
160         }
161 
162         /**
163          * Set the target text to match within; also resets the offset to zero, calling setOffset().
164          *
165          * @param text
166          * @return
167          */
setText(CharSequence text)168         public Matcher<T> setText(CharSequence text) {
169             this.text = new CharUtilities.CharSourceWrapper<>(text);
170             return setOffset(0);
171         }
172 
173         /**
174          * Retrieve the target text to match within.
175          *
176          * @return
177          */
getText()178         public CharSource getText() {
179             return text;
180         }
181 
182         /**
183          * Set the position in the target text to match from. Matches only go forwards
184          * in the string.
185          *
186          * @param offset
187          * @return
188          */
setOffset(int offset)189         public Matcher<T> setOffset(int offset) {
190             this.offset = offset;
191             matchEnd = offset;
192             return this;
193         }
194 
195         /**
196          * Get the offset from which we are matching.
197          *
198          * @return
199          */
getOffset()200         public int getOffset() {
201             return offset;
202         }
203 
204         /**
205          * Get the latest match value after calling next(); see next() for more information.
206          *
207          * @return
208          */
getMatchValue()209         public T getMatchValue() {
210             return matchValue;
211         }
212 
213         /**
214          * Get the latest match end (that is, how far we matched); see next() for more information.
215          *
216          * @return
217          */
getMatchEnd()218         public int getMatchEnd() {
219             return matchEnd;
220         }
221 
222         /**
223          * Get the text that we are matching in..
224          *
225          * @return
226          */
getMatchText()227         public CharSource getMatchText() {
228             return text.sublist(offset, matchEnd);
229         }
230 
231         /**
232          * The status of a match; see next() for more information.
233          */
234         public enum Status {
235             /**
236              * There are no further matches at all.
237              */
238             NONE,
239             /**
240              * There is a partial match for a single item. Example: dictionary contains
241              * "man", text has "max". There will be a partial match after "ma".
242              */
243             PARTIAL,
244             /**
245              * There is a full match
246              */
247             MATCH,
248         }
249 
250         /**
251          * Finds the next match, and sets the matchValue and matchEnd. Normally you
252          * call in a loop until you get NONE.<br>
253          * <b>Warning: the results of calling next() after getting NONE are
254          * undefined!</b><br>
255          * Here is what happens with the different return values:
256          * <ul>
257          * <li>MATCH: there was an exact match. Its matchValue and matchEnd are set.</li>
258          * <li>PARTIAL: there was a partial match. The matchEnd is set to the furthest point that could be reached
259          * successfully. To get the matchValue, and whether or not there were multiple partial matches, call
260          * nextPartial().</li>
261          * <li>NONE: the matchValue and matchEnd are undefined.</li>
262          * </ul>
263          *
264          * @return MATCH if there is a match.
265          */
next()266         public abstract Status next();
267 
268         public enum Filter {
269             /**
270              * Returns all values: 0 or more MATCH or PARTIAL values (with NONE at end)
271              */
272             ALL,
273             /**
274              * Only 0 or more MATCH values (with NONE at end)
275              */
276             MATCHES,
277             /**
278              * Only one value: the longest MATCH value
279              */
280             LONGEST_MATCH,
281             /**
282              * Only one value: the longest MATCH or PARTIAL
283              */
284             LONGEST,
285             /**
286              * Only one value: the longest MATCH or unique PARTIAL
287              */
288             LONGEST_UNIQUE,
289             /**
290              * Only one value: the longest MATCH or PARTIAL (but only the Partial if it is at the end).
291              */
292             LONGEST_WITH_FINAL_PARTIAL
293         }
294 
295         /**
296          * Finds the next match, and sets the matchValue and matchEnd. Normally you
297          * call in a loop until you get NONE.<br>
298          * <b>Warning: the results of calling next() after getting NONE are
299          * undefined!</b><br>
300          * Here is what happens with the different return values:
301          * <ul>
302          * <li>MATCH: there was an exact match. Its matchValue and matchEnd are set.</li>
303          * <li>PARTIAL: there was a partial match. The matchEnd is set to the furthest point that could be reached
304          * successfully. To get the matchValue, and whether or not there were multiple partial matches, call
305          * nextPartial().</li>
306          * <li>NONE: the matchValue and matchEnd are undefined.</li>
307          * </ul>
308          * Question: should we make the name more explicit, like nextAtOffset? Because
309          * this iterates through the matches <i>at</i> an offset, not through offsets.
310          *
311          * @return MATCH if there is a match.
312          */
next(Filter filter)313         public Status next(Filter filter) {
314             if (filter == Filter.ALL) {
315                 return next();
316             }
317             Status lastStatus = Status.NONE;
318             T lastValue = null;
319             int lastEnd = -1;
320             while (true) {
321                 Status status = next();
322                 if (status == Status.NONE) {
323                     if (lastValue == null) {
324                         return status;
325                     }
326                     matchEnd = lastEnd;
327                     matchValue = lastValue;
328                     return lastStatus;
329                 } else if (status == Status.MATCH
330                     || (status == Status.PARTIAL
331                         && (filter == Filter.LONGEST
332                             || filter == Filter.LONGEST_UNIQUE && nextUniquePartial()
333                             || filter == Filter.LONGEST_WITH_FINAL_PARTIAL && !text.hasCharAt(matchEnd)))) {
334                     if (filter == Filter.MATCHES) {
335                         return status;
336                     }
337                     lastStatus = status;
338                     lastValue = getMatchValue();
339                     lastEnd = matchEnd;
340                 }
341             }
342         }
343 
344         /**
345          * Determine whether a partial match is singular (there is only one possible
346          * continuation) or multiple (there are different continuations).
347          * <ul>
348          * <li>If so, sets the value of matchValue to that of the string that could have been returned if appropriate
349          * additional characters were inserted at matchEnd.</li>
350          * <li>If not, then the matchValue is indeterminate.</li>
351          * </ul>
352          * <p>
353          * This must only be called immediatedly following a PARTIAL result from next().
354          * <p>
355          * QUESTION: would it be useful to be able to get all the partial matches??
356          *
357          * @return true if the partial match is singular, false if it is plural.
358          */
nextUniquePartial()359         public abstract boolean nextUniquePartial();
360 
361         /**
362          * Return the value for a given piece of text, or Integer.MIN_VALUE if there is none. May be overridden for
363          * efficiency.
364          */
get(CharSource text)365         public T get(CharSource text) {
366             setText(text); // set the text to operate on
367             while (true) {
368                 Status next1 = next();
369                 if (next1 != Status.MATCH) {
370                     return null;
371                 } else if (!text.hasCharAt(getMatchEnd())) {
372                     return getMatchValue();
373                 }
374             }
375         }
376 
377         /**
378          * Advances the offset until next() doesn't return NONE.
379          */
find(Filter filter)380         public Status find(Filter filter) {
381             while (true) {
382                 Status status = next(filter);
383                 if (status != Status.NONE) {
384                     return status;
385                 } else if (!text.hasCharAt(getMatchEnd())) {
386                     return status;
387                 } else {
388                     nextOffset();
389                 }
390             }
391         }
392 
393         /**
394          * Increment the offset in the text.
395          */
nextOffset()396         public Matcher nextOffset() {
397             return setOffset(++offset);
398         }
399 
400         /**
401          * Convert the remaining text (after offset) to the target. Any substring with a MATCH is replaced by the
402          * value.toString(); other characters are copied. Converts to first (shortest) match..
403          * TODO add parameter to pick between shortest and longest.
404          *
405          * @param target
406          */
convert(Appendable target)407         public Appendable convert(Appendable target) {
408             try {
409                 while (text.hasCharAt(offset)) {
410                     Status status = next();
411                     if (status != Status.MATCH) {
412                         target.append(text.charAt(getOffset()));
413                         nextOffset();
414                     } else {
415                         target.append(getMatchValue().toString());
416                         setOffset(getMatchEnd());
417                     }
418                 }
419                 return target;
420             } catch (IOException e) {
421                 throw new ICUUncheckedIOException("Internal error", e);
422             }
423         }
424 
425         /**
426          * For debugging
427          */
428         @Override
toString()429         public String toString() {
430             return "{offset: " + offset + ", end: " + matchEnd + ", value: " + matchValue + ", text: \""
431                 + text.subSequence(0, text.getKnownLength())
432                 + (text.hasCharAt(text.getKnownLength()) ? "..." : "") + "\"}";
433         }
434 
435         /**
436          * Get the dictionary associated with this matcher.
437          */
getDictionary()438         public abstract Dictionary<T> getDictionary();
439 
440         /**
441          * The offset is not yet at the end.
442          *
443          * @return
444          */
hasMore()445         public boolean hasMore() {
446             return text.hasCharAt(offset);
447         }
448     }
449 
450     /**
451      * Return the code point order of two CharSequences. Really ought to be a method on CharSequence.
452      * If the text has isolated surrogates, they will not sort correctly.
453      */
454     public static final Comparator<CharSequence> CHAR_SEQUENCE_COMPARATOR = new Comparator<CharSequence>() {
455         @Override
456         public int compare(CharSequence o1, CharSequence o2) {
457             return CharUtilities.compare(o1, o2);
458         }
459     };
460 
461     public static class DictionaryCharList<T extends CharSequence> extends CharSourceWrapper<T> {
462         protected boolean failOnLength = false;
463         protected StringBuilder buffer = new StringBuilder();
464         protected int[] sourceOffsets;
465         protected Matcher<T> matcher;
466         protected boolean atEnd;
467 
DictionaryCharList(Dictionary<T> dictionary, T source)468         public DictionaryCharList(Dictionary<T> dictionary, T source) {
469             super(source);
470             matcher = dictionary.getMatcher().setText(source);
471             atEnd = source.length() == 0;
472             sourceOffsets = new int[source.length()];
473         }
474 
475         @Override
hasCharAt(int index)476         public boolean hasCharAt(int index) {
477             if (index >= buffer.length()) {
478                 if (atEnd) {
479                     return false;
480                 }
481                 growToOffset(index + 1);
482                 return index < buffer.length();
483             }
484             return true;
485         }
486 
487         @Override
charAt(int index)488         public char charAt(int index) {
489             if (!atEnd && index >= buffer.length()) {
490                 growToOffset(index + 1);
491             }
492             return buffer.charAt(index);
493         }
494 
495         // sourceOffsets contains valid entries up to buffer.length() + 1.
growToOffset(int offset)496         private void growToOffset(int offset) {
497             int length = buffer.length();
498             while (length < offset && !atEnd) {
499                 Status status = matcher.next(Filter.LONGEST_MATCH);
500                 int currentOffset = matcher.getOffset();
501                 final int matchEnd = matcher.getMatchEnd();
502                 if (status == Status.MATCH) {
503                     final T replacement = matcher.getMatchValue();
504                     setOffsets(length + 1, replacement.length(), matchEnd);
505                     buffer.append(replacement);
506                     length = buffer.length();
507                     matcher.setOffset(matchEnd);
508                 } else {
509                     setOffsets(length + 1, 1, currentOffset + 1);
510                     buffer.append(source.charAt(currentOffset));
511                     length = buffer.length();
512                     matcher.nextOffset();
513                 }
514                 atEnd = matcher.getOffset() >= source.length();
515             }
516         }
517 
setOffsets(final int start, final int count, final int value)518         private void setOffsets(final int start, final int count, final int value) {
519             final int length = start + count;
520             if (sourceOffsets.length < length) {
521                 int newCapacity = sourceOffsets.length * 2 + 1;
522                 if (newCapacity < length + 50) {
523                     newCapacity = length + 50;
524                 }
525                 int[] temp = new int[newCapacity];
526                 System.arraycopy(sourceOffsets, 0, temp, 0, sourceOffsets.length);
527                 sourceOffsets = temp;
528             }
529             for (int i = start; i < length; ++i) {
530                 sourceOffsets[i] = value;
531             }
532         }
533 
534         @Override
fromSourceOffset(int offset)535         public int fromSourceOffset(int offset) {
536             // TODO fix to do real binary search; returning a determinate value.
537             return Arrays.binarySearch(sourceOffsets, offset);
538         }
539 
540         @Override
toSourceOffset(int offset)541         public int toSourceOffset(int offset) {
542             if (offset > buffer.length()) {
543                 growToOffset(offset);
544                 if (offset > buffer.length()) {
545                     throw new ArrayIndexOutOfBoundsException(offset);
546                 }
547             }
548             return sourceOffsets[offset];
549         }
550 
551         @Override
subSequence(int start, int end)552         public CharSequence subSequence(int start, int end) {
553             if (!atEnd && end > buffer.length()) {
554                 growToOffset(end);
555             }
556             return buffer.subSequence(start, end);
557         }
558 
559         @Override
sourceSubSequence(int start, int end)560         public CharSequence sourceSubSequence(int start, int end) {
561             // TODO Auto-generated method stub
562             return source.subSequence(toSourceOffset(start), toSourceOffset(end));
563         }
564 
565         @Override
getKnownLength()566         public int getKnownLength() {
567             return buffer.length();
568         }
569     }
570 
load(Iterator<Entry<A, B>> input, Map<A, B> output)571     public static <A, B> Map<A, B> load(Iterator<Entry<A, B>> input, Map<A, B> output) {
572         while (input.hasNext()) {
573             Entry<A, B> entry = input.next();
574             output.put(entry.getKey(), entry.getValue());
575         }
576         return output;
577     }
578 }