• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2014-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.text.CharacterIterator;
12 import java.util.HashSet;
13 
14 import com.ibm.icu.impl.ICUResourceBundle.OpenType;
15 import com.ibm.icu.text.BreakIterator;
16 import com.ibm.icu.text.FilteredBreakIteratorBuilder;
17 import com.ibm.icu.text.UCharacterIterator;
18 import com.ibm.icu.util.BytesTrie;
19 import com.ibm.icu.util.CharsTrie;
20 import com.ibm.icu.util.CharsTrieBuilder;
21 import com.ibm.icu.util.StringTrieBuilder;
22 import com.ibm.icu.util.ULocale;
23 
24 /**
25  * @author tomzhang
26  */
27 public class SimpleFilteredSentenceBreakIterator extends BreakIterator {
28 
29     private BreakIterator delegate;
30     private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ?
31     private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs.
32     private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M."
33 
34     /**
35      * @param adoptBreakIterator
36      *            break iterator to adopt
37      * @param forwardsPartialTrie
38      *            forward & partial char trie to adopt
39      * @param backwardsTrie
40      *            backward trie to adopt
41      */
SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)42     public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie,
43             CharsTrie backwardsTrie) {
44         this.delegate = adoptBreakIterator;
45         this.forwardsPartialTrie = forwardsPartialTrie;
46         this.backwardsTrie = backwardsTrie;
47     }
48 
49 
50     /**
51      * Reset the filter from the delegate.
52      */
resetState()53     private final void resetState() {
54         text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone());
55     }
56 
57     /**
58      * Is there an exception at this point?
59      *
60      * @param n
61      * @return
62      */
breakExceptionAt(int n)63     private final boolean breakExceptionAt(int n) {
64         // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt()
65 
66         int bestPosn = -1;
67         int bestValue = -1;
68 
69         // loops while 'n' points to an exception
70         text.setIndex(n);
71         backwardsTrie.reset();
72         int uch;
73 
74         // Assume a space is following the '.' (so we handle the case: "Mr. /Brown")
75         if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here??
76             // TODO only do this the 1st time?
77         } else {
78             uch = text.nextCodePoint();
79         }
80 
81         BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE;
82 
83         while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and..
84                 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie
85             if (r.hasValue()) { // remember the best match so far
86                 bestPosn = text.getIndex();
87                 bestValue = backwardsTrie.getValue();
88             }
89         }
90 
91         if (r.matches()) { // exact match?
92             bestValue = backwardsTrie.getValue();
93             bestPosn = text.getIndex();
94         }
95 
96         if (bestPosn >= 0) {
97             if (bestValue == Builder.MATCH) { // exact match!
98                 return true; // Exception here.
99             } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) {
100                 // make sure there's a forward trie
101                 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie
102                 // to see if it matches something going forward.
103                 forwardsPartialTrie.reset();
104 
105                 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE;
106                 text.setIndex(bestPosn); // hope that's close ..
107                 while ((uch = text.nextCodePoint()) != BreakIterator.DONE
108                         && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) {
109                 }
110                 if (rfwd.matches()) {
111                     // Exception here
112                     return true;
113                 } // else fall through
114             } // else fall through
115         } // else fall through
116         return false; // No exception here.
117     }
118 
119     /**
120      * Given that the delegate has already given its "initial" answer,
121      * find the NEXT actual (non-suppressed) break.
122      * @param n initial position from delegate
123      * @return new break position or BreakIterator.DONE
124      */
internalNext(int n)125     private final int internalNext(int n) {
126         if (n == BreakIterator.DONE || // at end or
127                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
128             return n;
129         }
130         resetState();
131 
132         final int textLen = text.getLength();
133 
134         while (n != BreakIterator.DONE && n != textLen) {
135             // outer loop runs once per underlying break (from fDelegate).
136             // loops while 'n' points to an exception.
137 
138             if (breakExceptionAt(n)) {
139                 // n points to a break exception
140                 n = delegate.next();
141             } else {
142                 // no exception at this spot
143                 return n;
144             }
145         }
146         return n; //hit underlying DONE or break at end of text
147     }
148 
149     /**
150      * Given that the delegate has already given its "initial" answer,
151      * find the PREV actual (non-suppressed) break.
152      * @param n initial position from delegate
153      * @return new break position or BreakIterator.DONE
154      */
internalPrev(int n)155     private final int internalPrev(int n) {
156         if (n == 0 || n == BreakIterator.DONE || // at end or
157                 backwardsTrie == null) { // .. no backwards table loaded == no exceptions
158             return n;
159         }
160         resetState();
161 
162         while (n != BreakIterator.DONE && n != 0) {
163             // outer loop runs once per underlying break (from fDelegate).
164             // loops while 'n' points to an exception.
165 
166             if (breakExceptionAt(n)) {
167                 // n points to a break exception
168                 n = delegate.previous();
169             } else {
170                 // no exception at this spot
171                 return n;
172             }
173         }
174         return n; //hit underlying DONE or break at end of text
175     }
176 
177     @Override
equals(Object obj)178     public boolean equals(Object obj) {
179         if (obj == null)
180             return false;
181         if (this == obj)
182             return true;
183         if (getClass() != obj.getClass())
184             return false;
185         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj;
186         return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie)
187                 && forwardsPartialTrie.equals(other.forwardsPartialTrie);
188     }
189 
190     @Override
hashCode()191     public int hashCode() {
192         return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode();
193     }
194 
195     @Override
clone()196     public Object clone() {
197         SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone();
198         return other;
199     }
200 
201 
202     @Override
first()203     public int first() {
204         // Don't suppress a break opportunity at the beginning of text.
205         return delegate.first();
206     }
207 
208     @Override
preceding(int offset)209     public int preceding(int offset) {
210         return internalPrev(delegate.preceding(offset));
211     }
212 
213     @Override
previous()214     public int previous() {
215         return internalPrev(delegate.previous());
216     }
217 
218     @Override
current()219     public int current() {
220         return delegate.current();
221     }
222 
223     @Override
isBoundary(int offset)224     public boolean isBoundary(int offset) {
225         if(!delegate.isBoundary(offset)) {
226             return false; // No underlying break to suppress?
227         }
228 
229         // delegate thinks there's a break…
230         if(backwardsTrie == null) {
231             return true; // no data
232         }
233 
234         resetState();
235         return !breakExceptionAt(offset); // if there's an exception: no break.
236     }
237 
238     @Override
next()239     public int next() {
240         return internalNext(delegate.next());
241     }
242 
243     @Override
next(int n)244     public int next(int n) {
245         return internalNext(delegate.next(n));
246     }
247 
248     @Override
following(int offset)249     public int following(int offset) {
250         return internalNext(delegate.following(offset));
251     }
252 
253     @Override
last()254     public int last() {
255         // Don't suppress a break opportunity at the end of text.
256         return delegate.last();
257     }
258 
259     @Override
getText()260     public CharacterIterator getText() {
261         return delegate.getText();
262     }
263 
264     @Override
setText(CharacterIterator newText)265     public void setText(CharacterIterator newText) {
266         delegate.setText(newText);
267     }
268 
269     public static class Builder extends FilteredBreakIteratorBuilder {
270         /**
271          * filter set to store all exceptions
272          */
273         private HashSet<String> filterSet = new HashSet<String>();
274 
275         static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie
276         static final int MATCH = (1 << 1); // < exact match - skip this one.
277         static final int SuppressInReverse = (1 << 0);
278         static final int AddToForward = (1 << 1);
279 
280         /**
281          * Create SimpleFilteredBreakIteratorBuilder using given locale
282          * @param loc the locale to get filtered iterators
283          */
Builder(ULocale loc)284         public Builder(ULocale loc) {
285             ICUResourceBundle rb = ICUResourceBundle.getBundleInstance(
286                     ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT);
287 
288             ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak");
289 
290             if (breaks != null) {
291                 for (int index = 0, size = breaks.getSize(); index < size; ++index) {
292                     ICUResourceBundle b = (ICUResourceBundle) breaks.get(index);
293                     String br = b.getString();
294                     filterSet.add(br);
295                 }
296             }
297         }
298 
299         /**
300          * Create SimpleFilteredBreakIteratorBuilder with no exception
301          */
Builder()302         public Builder() {
303             filterSet = new HashSet<String>();
304         }
305 
306         @Override
suppressBreakAfter(String str)307         public boolean suppressBreakAfter(String str) {
308             if (filterSet == null) {
309                 filterSet = new HashSet<String>();
310             }
311             return filterSet.add(str);
312         }
313 
314         @Override
unsuppressBreakAfter(String str)315         public boolean unsuppressBreakAfter(String str) {
316             if (filterSet == null) {
317                 return false;
318             } else {
319                 return filterSet.remove(str);
320             }
321         }
322 
323         @Override
build(BreakIterator adoptBreakIterator)324         public BreakIterator build(BreakIterator adoptBreakIterator) {
325             if( filterSet.isEmpty() ) {
326                 // Short circuit - nothing to except.
327                 return adoptBreakIterator;
328             }
329 
330             CharsTrieBuilder builder = new CharsTrieBuilder();
331             CharsTrieBuilder builder2 = new CharsTrieBuilder();
332 
333             int revCount = 0;
334             int fwdCount = 0;
335 
336             int subCount = filterSet.size();
337             String[] ustrs = new String[subCount];
338             int[] partials = new int[subCount];
339 
340             CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs.
341             CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M."
342 
343             int i = 0;
344             for (String s : filterSet) {
345                 ustrs[i] = s; // copy by value?
346                 partials[i] = 0; // default: no partial
347                 i++;
348             }
349 
350             for (i = 0; i < subCount; i++) {
351                 int nn = ustrs[i].indexOf('.'); // TODO: non-'.' abbreviations
352                 if (nn > -1 && (nn + 1) != ustrs[i].length()) {
353                     // is partial.
354                     // is it unique?
355                     int sameAs = -1;
356                     for (int j = 0; j < subCount; j++) {
357                         if (j == i)
358                             continue;
359                         if (ustrs[i].regionMatches(0, ustrs[j], 0, nn + 1)) {
360                             if (partials[j] == 0) { // hasn't been processed yet
361                                 partials[j] = SuppressInReverse | AddToForward;
362                             } else if ((partials[j] & SuppressInReverse) != 0) {
363                                 sameAs = j; // the other entry is already in the reverse table.
364                             }
365                         }
366                     }
367 
368                     if ((sameAs == -1) && (partials[i] == 0)) {
369                         StringBuilder prefix = new StringBuilder(ustrs[i].substring(0, nn + 1));
370                         // first one - add the prefix to the reverse table.
371                         prefix.reverse();
372                         builder.add(prefix, PARTIAL);
373                         revCount++;
374                         partials[i] = SuppressInReverse | AddToForward;
375                     }
376                 }
377             }
378 
379             for (i = 0; i < subCount; i++) {
380                 if (partials[i] == 0) {
381                     StringBuilder reversed = new StringBuilder(ustrs[i]).reverse();
382                     builder.add(reversed, MATCH);
383                     revCount++;
384                 } else {
385                     // an optimization would be to only add the portion after the '.'
386                     // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the
387                     // forward,
388                     // instead of "Ph.D." since we already know the "Ph." part is a match.
389                     // would need the trie to be able to hold 0-length strings, though.
390                     builder2.add(ustrs[i], MATCH); // forward
391                     fwdCount++;
392                 }
393             }
394 
395             if (revCount > 0) {
396                 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST);
397             }
398 
399             if (fwdCount > 0) {
400                 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST);
401             }
402             return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie);
403         }
404     }
405 }
406