• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.text.CharacterIterator;
12 import java.util.HashSet;
13 import java.util.Locale;
14 
15 import com.ibm.icu.impl.ICUResourceBundle.OpenType;
16 import com.ibm.icu.text.BreakIterator;
17 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
18 import com.ibm.icu.text.UCharacterIterator;
19 import com.ibm.icu.util.BytesTrie;
20 import com.ibm.icu.util.CharsTrie;
21 import com.ibm.icu.util.CharsTrieBuilder;
22 import com.ibm.icu.util.ICUCloneNotSupportedException;
23 import com.ibm.icu.util.StringTrieBuilder;
24 import com.ibm.icu.util.ULocale;
25 
26 /**
27  * @author tomzhang
28  */
29 public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
30 
31     private BreakIterator delegate;
32     private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ?
33     private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs.
34     private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M."
35 
36     /**
37      * @param adoptBreakIterator
38      *            break iterator to adopt
39      * @param forwardsPartialTrie
40      *            forward & partial char trie to adopt
41      * @param backwardsTrie
42      *            backward trie to adopt
43      */
SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)44     public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie,
45             CharsTrie backwardsTrie) {
46         this.delegate = adoptBreakIterator;
47         this.forwardsPartialTrie = forwardsPartialTrie;
48         this.backwardsTrie = backwardsTrie;
49     }
50 
51 
52     /**
53      * Reset the filter from the delegate.
54      */
resetState()55     private final void resetState() {
56         text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
57     }
58 
59     /**
60      * Is there an exception at this point?
61      *
62      * @param n the location of the possible break
63      * @return
64      */
breakExceptionAt(int n)65     private final boolean breakExceptionAt(int n) {
66         // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
67 
68         int bestPosn = -1;
69         int bestValue = -1;
70 
71         // loops while 'n' points to an exception
72         text.setIndex(n);
73         backwardsTrie.reset();
74         int uch;
75 
76         // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
77         if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
78             // TODO only do this the 1st time?
79         } else {
80             uch = text.nextCodePoint();
81         }
82 
83         while ((uch = text.previousCodePoint()) >= 0) { // more to consume backwards
84             BytesTrie.Result r = backwardsTrie.nextForCodePoint(uch);
85             if (r.hasValue()) { // remember the best match so far
86                 bestPosn = text.getIndex();
87                 bestValue = backwardsTrie.getValue();
88             }
89             if (!r.hasNext()) {
90                 break;
91             }
92         }
93         backwardsTrie.reset(); // for equals() & hashCode()
94 
95         if (bestPosn >= 0) {
96             if (bestValue == Builder.MATCH) { // exact match!
97                 return true; // Exception here.
98             } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
99                 // make sure there's a forward trie
100                 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
101                 // to see if it matches something going forward.
102                 forwardsPartialTrie.reset();
103 
104                 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
105                 text.setIndex(bestPosn); // hope that's close ..
106                 while ((uch = text.nextCodePoint()) != BreakIterator.DONE
107                         && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
108                 }
109                 forwardsPartialTrie.reset(); // for equals() & hashCode()
110                 if (rfwd.matches()) {
111                     // Exception here
112                     return true;
113                 } // else fall through
114             } // else fall through
115         } // else fall through
116         return false; // No exception here.
117     }
118 
119     /**
120      * Given that the delegate has already given its "initial" answer,
121      * find the NEXT actual (non-suppressed) break.
122      * @param n initial position from delegate
123      * @return new break position or BreakIterator.DONE
124      */
internalNext(int n)125     private final int internalNext(int n) {
126         if (n == BreakIterator.DONE || // at end or
127                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
128             return n;
129         }
130         resetState();
131 
132         final int textLen = text.getLength();
133 
134         while (n != BreakIterator.DONE && n != textLen) {
135             // outer loop runs once per underlying break (from fDelegate).
136             // loops while 'n' points to an exception.
137 
138             if (breakExceptionAt(n)) {
139                 // n points to a break exception
140                 n = delegate.next();
141             } else {
142                 // no exception at this spot
143                 return n;
144             }
145         }
146         return n; //hit underlying DONE or break at end of text
147     }
148 
149     /**
150      * Given that the delegate has already given its "initial" answer,
151      * find the PREV actual (non-suppressed) break.
152      * @param n initial position from delegate
153      * @return new break position or BreakIterator.DONE
154      */
internalPrev(int n)155     private final int internalPrev(int n) {
156         if (n == 0 || n == BreakIterator.DONE || // at end or
157                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
158             return n;
159         }
160         resetState();
161 
162         while (n != BreakIterator.DONE && n != 0) {
163             // outer loop runs once per underlying break (from fDelegate).
164             // loops while 'n' points to an exception.
165 
166             if (breakExceptionAt(n)) {
167                 // n points to a break exception
168                 n = delegate.previous();
169             } else {
170                 // no exception at this spot
171                 return n;
172             }
173         }
174         return n; //hit underlying DONE or break at end of text
175     }
176 
177     @Override
equals(Object obj)178     public boolean equals(Object obj) {
179         if (obj == null)
180             return false;
181         if (this == obj)
182             return true;
183         if (getClass() != obj.getClass())
184             return false;
185         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
186         // TODO(ICU-21575): CharsTrie.equals() is not defined.
187         // Should compare the underlying data, and can then stop resetting after iteration.
188         return delegate.equals(other.delegate) && text.equals(other.text)
189                 && backwardsTrie.equals(other.backwardsTrie)
190                 && forwardsPartialTrie.equals(other.forwardsPartialTrie);
191     }
192 
193     @Override
hashCode()194     public int hashCode() {
195         // TODO(ICU-21575): CharsTrie.hashCode() is not defined.
196         return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11)
197                 + delegate.hashCode();
198     }
199 
200     @Override
clone()201     public Object clone() {
202         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
203         try {
204             if (delegate != null) {
205                 other.delegate = (BreakIterator) delegate.clone();
206             }
207             if (text != null) {
208                 other.text = (UCharacterIterator) text.clone();
209             }
210             if (backwardsTrie != null) {
211                 other.backwardsTrie = backwardsTrie.clone();
212             }
213             if (forwardsPartialTrie != null) {
214                 other.forwardsPartialTrie = forwardsPartialTrie.clone();
215             }
216         } catch (CloneNotSupportedException e) {
217             throw new ICUCloneNotSupportedException(e);
218         }
219         return other;
220     }
221 
222 
223     @Override
first()224     public int first() {
225         // Don't suppress a break opportunity at the beginning of text.
226         return delegate.first();
227     }
228 
229     @Override
preceding(int offset)230     public int preceding(int offset) {
231         return internalPrev(delegate.preceding(offset));
232     }
233 
234     @Override
previous()235     public int previous() {
236         return internalPrev(delegate.previous());
237     }
238 
239     @Override
current()240     public int current() {
241         return delegate.current();
242     }
243 
244     @Override
isBoundary(int offset)245     public boolean isBoundary(int offset) {
246         if(!delegate.isBoundary(offset)) {
247             return false; // No underlying break to suppress?
248         }
249 
250         // delegate thinks there's a break…
251         if(backwardsTrie == null) {
252             return true; // no data
253         }
254 
255         resetState();
256         return !breakExceptionAt(offset); // if there's an exception: no break.
257     }
258 
259     @Override
next()260     public int next() {
261         return internalNext(delegate.next());
262     }
263 
264     @Override
next(int n)265     public int next(int n) {
266         return internalNext(delegate.next(n));
267     }
268 
269     @Override
following(int offset)270     public int following(int offset) {
271         return internalNext(delegate.following(offset));
272     }
273 
274     @Override
last()275     public int last() {
276         // Don't suppress a break opportunity at the end of text.
277         return delegate.last();
278     }
279 
280     @Override
getText()281     public CharacterIterator getText() {
282         return delegate.getText();
283     }
284 
285     @Override
setText(CharacterIterator newText)286     public void setText(CharacterIterator newText) {
287         delegate.setText(newText);
288     }
289 
290     public static class Builder extends FilteredBreakIteratorBuilder {
291         /**
292          * filter set to store all exceptions
293          */
294         private HashSet<CharSequence> filterSet = new HashSet<>();
295 
296         static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
297         static final int MATCH = (1 << 1); // < exact match - skip this one.
298         static final int SuppressInReverse = (1 << 0);
299         static final int AddToForward = (1 << 1);
300 
Builder(Locale loc)301         public Builder(Locale loc) {
302             this(ULocale.forLocale(loc));
303         }
304         /**
305          * Create SimpleFilteredBreakIteratorBuilder using given locale
306          * @param loc the locale to get filtered iterators
307          */
Builder(ULocale loc)308         public Builder(ULocale loc) {
309             ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
310                     ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
311 
312             ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
313 
314             if (breaks != null) {
315                 for (int index = 0, size = breaks.getSize(); index < size; ++index) {
316                     ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
317                     String br = b.getString();
318                     filterSet.add(br);
319                 }
320             }
321         }
322 
323         /**
324          * Create SimpleFilteredBreakIteratorBuilder with no exception
325          */
Builder()326         public Builder() {
327         }
328 
329         @Override
suppressBreakAfter(CharSequence str)330         public boolean suppressBreakAfter(CharSequence str) {
331             return filterSet.add(str);
332         }
333 
334         @Override
unsuppressBreakAfter(CharSequence str)335         public boolean unsuppressBreakAfter(CharSequence str) {
336             return filterSet.remove(str);
337         }
338 
339         @Override
wrapIteratorWithFilter(BreakIterator adoptBreakIterator)340         public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) {
341             if( filterSet.isEmpty() ) {
342                 // Short circuit - nothing to except.
343                 return adoptBreakIterator;
344             }
345 
346             CharsTrieBuilder builder = new CharsTrieBuilder();
347             CharsTrieBuilder builder2 = new CharsTrieBuilder();
348 
349             int revCount = 0;
350             int fwdCount = 0;
351 
352             int subCount = filterSet.size();
353             CharSequence[] ustrs = new CharSequence[subCount];
354             int[] partials = new int[subCount];
355 
356             CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs.
357             CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M."
358 
359             int i = 0;
360             for (CharSequence s : filterSet) {
361                 ustrs[i] = s; // copy by value?
362                 partials[i] = 0; // default: no partial
363                 i++;
364             }
365 
366             for (i = 0; i < subCount; i++) {
367                 String thisStr = ustrs[i].toString(); // TODO: don't cast to String?
368                 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations
369                 if (nn > -1 && (nn + 1) != thisStr.length()) {
370                     // is partial.
371                     // is it unique?
372                     int sameAs = -1;
373                     for (int j = 0; j < subCount; j++) {
374                         if (j == i)
375                             continue;
376                         if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) {
377                             if (partials[j] == 0) { // hasn't been processed yet
378                                 partials[j] = SuppressInReverse | AddToForward;
379                             } else if ((partials[j] & SuppressInReverse) != 0) {
380                                 sameAs = j; // the other entry is already in the reverse table.
381                             }
382                         }
383                     }
384 
385                     if ((sameAs == -1) && (partials[i] == 0)) {
386                         StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1));
387                         // first one - add the prefix to the reverse table.
388                         prefix.reverse();
389                         builder.add(prefix, PARTIAL);
390                         revCount++;
391                         partials[i] = SuppressInReverse | AddToForward;
392                     }
393                 }
394             }
395 
396             for (i = 0; i < subCount; i++) {
397                 final String thisStr = ustrs[i].toString(); // TODO
398                 if (partials[i] == 0) {
399                     StringBuilder reversed = new StringBuilder(thisStr).reverse();
400                     builder.add(reversed, MATCH);
401                     revCount++;
402                 } else {
403                     // an optimization would be to only add the portion after the '.'
404                     // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the
405                     // forward,
406                     // instead of "Ph.D." since we already know the "Ph." part is a match.
407                     // would need the trie to be able to hold 0-length strings, though.
408                     builder2.add(thisStr, MATCH); // forward
409                     fwdCount++;
410                 }
411             }
412 
413             if (revCount > 0) {
414                 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST);
415             }
416 
417             if (fwdCount > 0) {
418                 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST);
419             }
420             return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie);
421         }
422     }
423 }
424