1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.text.CharacterIterator; 12 import java.util.HashSet; 13 14 import com.ibm.icu.impl.ICUResourceBundle.OpenType; 15 import com.ibm.icu.text.BreakIterator; 16 import com.ibm.icu.text.FilteredBreakIteratorBuilder; 17 import com.ibm.icu.text.UCharacterIterator; 18 import com.ibm.icu.util.BytesTrie; 19 import com.ibm.icu.util.CharsTrie; 20 import com.ibm.icu.util.CharsTrieBuilder; 21 import com.ibm.icu.util.StringTrieBuilder; 22 import com.ibm.icu.util.ULocale; 23 24 /** 25 * @author tomzhang 26 */ 27 public class SimpleFilteredSentenceBreakIterator extends BreakIterator { 28 29 private BreakIterator delegate; 30 private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ? 31 private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs. 32 private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M." 33 34 /** 35 * @param adoptBreakIterator 36 * break iterator to adopt 37 * @param forwardsPartialTrie 38 * forward & partial char trie to adopt 39 * @param backwardsTrie 40 * backward trie to adopt 41 */ SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)42 public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, 43 CharsTrie backwardsTrie) { 44 this.delegate = adoptBreakIterator; 45 this.forwardsPartialTrie = forwardsPartialTrie; 46 this.backwardsTrie = backwardsTrie; 47 } 48 49 50 /** 51 * Reset the filter from the delegate. 52 */ resetState()53 private final void resetState() { 54 text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone()); 55 } 56 57 /** 58 * Is there an exception at this point? 59 * 60 * @param n 61 * @return 62 */ breakExceptionAt(int n)63 private final boolean breakExceptionAt(int n) { 64 // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt() 65 66 int bestPosn = -1; 67 int bestValue = -1; 68 69 // loops while 'n' points to an exception 70 text.setIndex(n); 71 backwardsTrie.reset(); 72 int uch; 73 74 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 75 if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? 76 // TODO only do this the 1st time? 77 } else { 78 uch = text.nextCodePoint(); 79 } 80 81 BytesTrie.Result r = BytesTrie.Result.INTERMEDIATE_VALUE; 82 83 while ((uch = text.previousCodePoint()) != UCharacterIterator.DONE && // more to consume backwards and.. 84 ((r = backwardsTrie.nextForCodePoint(uch)).hasNext())) {// more in the trie 85 if (r.hasValue()) { // remember the best match so far 86 bestPosn = text.getIndex(); 87 bestValue = backwardsTrie.getValue(); 88 } 89 } 90 91 if (r.matches()) { // exact match? 92 bestValue = backwardsTrie.getValue(); 93 bestPosn = text.getIndex(); 94 } 95 96 if (bestPosn >= 0) { 97 if (bestValue == Builder.MATCH) { // exact match! 98 return true; // Exception here. 99 } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) { 100 // make sure there's a forward trie 101 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 102 // to see if it matches something going forward. 103 forwardsPartialTrie.reset(); 104 105 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE; 106 text.setIndex(bestPosn); // hope that's close .. 107 while ((uch = text.nextCodePoint()) != BreakIterator.DONE 108 && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) { 109 } 110 if (rfwd.matches()) { 111 // Exception here 112 return true; 113 } // else fall through 114 } // else fall through 115 } // else fall through 116 return false; // No exception here. 117 } 118 119 /** 120 * Given that the delegate has already given its "initial" answer, 121 * find the NEXT actual (non-suppressed) break. 122 * @param n initial position from delegate 123 * @return new break position or BreakIterator.DONE 124 */ internalNext(int n)125 private final int internalNext(int n) { 126 if (n == BreakIterator.DONE || // at end or 127 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 128 return n; 129 } 130 resetState(); 131 132 final int textLen = text.getLength(); 133 134 while (n != BreakIterator.DONE && n != textLen) { 135 // outer loop runs once per underlying break (from fDelegate). 136 // loops while 'n' points to an exception. 137 138 if (breakExceptionAt(n)) { 139 // n points to a break exception 140 n = delegate.next(); 141 } else { 142 // no exception at this spot 143 return n; 144 } 145 } 146 return n; //hit underlying DONE or break at end of text 147 } 148 149 /** 150 * Given that the delegate has already given its "initial" answer, 151 * find the PREV actual (non-suppressed) break. 152 * @param n initial position from delegate 153 * @return new break position or BreakIterator.DONE 154 */ internalPrev(int n)155 private final int internalPrev(int n) { 156 if (n == 0 || n == BreakIterator.DONE || // at end or 157 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 158 return n; 159 } 160 resetState(); 161 162 while (n != BreakIterator.DONE && n != 0) { 163 // outer loop runs once per underlying break (from fDelegate). 164 // loops while 'n' points to an exception. 165 166 if (breakExceptionAt(n)) { 167 // n points to a break exception 168 n = delegate.previous(); 169 } else { 170 // no exception at this spot 171 return n; 172 } 173 } 174 return n; //hit underlying DONE or break at end of text 175 } 176 177 @Override equals(Object obj)178 public boolean equals(Object obj) { 179 if (obj == null) 180 return false; 181 if (this == obj) 182 return true; 183 if (getClass() != obj.getClass()) 184 return false; 185 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj; 186 return delegate.equals(other.delegate) && text.equals(other.text) && backwardsTrie.equals(other.backwardsTrie) 187 && forwardsPartialTrie.equals(other.forwardsPartialTrie); 188 } 189 190 @Override hashCode()191 public int hashCode() { 192 return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) + delegate.hashCode(); 193 } 194 195 @Override clone()196 public Object clone() { 197 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone(); 198 return other; 199 } 200 201 202 @Override first()203 public int first() { 204 // Don't suppress a break opportunity at the beginning of text. 205 return delegate.first(); 206 } 207 208 @Override preceding(int offset)209 public int preceding(int offset) { 210 return internalPrev(delegate.preceding(offset)); 211 } 212 213 @Override previous()214 public int previous() { 215 return internalPrev(delegate.previous()); 216 } 217 218 @Override current()219 public int current() { 220 return delegate.current(); 221 } 222 223 @Override isBoundary(int offset)224 public boolean isBoundary(int offset) { 225 if(!delegate.isBoundary(offset)) { 226 return false; // No underlying break to suppress? 227 } 228 229 // delegate thinks there's a break… 230 if(backwardsTrie == null) { 231 return true; // no data 232 } 233 234 resetState(); 235 return !breakExceptionAt(offset); // if there's an exception: no break. 236 } 237 238 @Override next()239 public int next() { 240 return internalNext(delegate.next()); 241 } 242 243 @Override next(int n)244 public int next(int n) { 245 return internalNext(delegate.next(n)); 246 } 247 248 @Override following(int offset)249 public int following(int offset) { 250 return internalNext(delegate.following(offset)); 251 } 252 253 @Override last()254 public int last() { 255 // Don't suppress a break opportunity at the end of text. 256 return delegate.last(); 257 } 258 259 @Override getText()260 public CharacterIterator getText() { 261 return delegate.getText(); 262 } 263 264 @Override setText(CharacterIterator newText)265 public void setText(CharacterIterator newText) { 266 delegate.setText(newText); 267 } 268 269 public static class Builder extends FilteredBreakIteratorBuilder { 270 /** 271 * filter set to store all exceptions 272 */ 273 private HashSet<String> filterSet = new HashSet<String>(); 274 275 static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie 276 static final int MATCH = (1 << 1); // < exact match - skip this one. 277 static final int SuppressInReverse = (1 << 0); 278 static final int AddToForward = (1 << 1); 279 280 /** 281 * Create SimpleFilteredBreakIteratorBuilder using given locale 282 * @param loc the locale to get filtered iterators 283 */ Builder(ULocale loc)284 public Builder(ULocale loc) { 285 ICUResourceBundle rb = ICUResourceBundle.getBundleInstance( 286 ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT); 287 288 ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak"); 289 290 if (breaks != null) { 291 for (int index = 0, size = breaks.getSize(); index < size; ++index) { 292 ICUResourceBundle b = (ICUResourceBundle) breaks.get(index); 293 String br = b.getString(); 294 filterSet.add(br); 295 } 296 } 297 } 298 299 /** 300 * Create SimpleFilteredBreakIteratorBuilder with no exception 301 */ Builder()302 public Builder() { 303 filterSet = new HashSet<String>(); 304 } 305 306 @Override suppressBreakAfter(String str)307 public boolean suppressBreakAfter(String str) { 308 if (filterSet == null) { 309 filterSet = new HashSet<String>(); 310 } 311 return filterSet.add(str); 312 } 313 314 @Override unsuppressBreakAfter(String str)315 public boolean unsuppressBreakAfter(String str) { 316 if (filterSet == null) { 317 return false; 318 } else { 319 return filterSet.remove(str); 320 } 321 } 322 323 @Override build(BreakIterator adoptBreakIterator)324 public BreakIterator build(BreakIterator adoptBreakIterator) { 325 if( filterSet.isEmpty() ) { 326 // Short circuit - nothing to except. 327 return adoptBreakIterator; 328 } 329 330 CharsTrieBuilder builder = new CharsTrieBuilder(); 331 CharsTrieBuilder builder2 = new CharsTrieBuilder(); 332 333 int revCount = 0; 334 int fwdCount = 0; 335 336 int subCount = filterSet.size(); 337 String[] ustrs = new String[subCount]; 338 int[] partials = new int[subCount]; 339 340 CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. 341 CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." 342 343 int i = 0; 344 for (String s : filterSet) { 345 ustrs[i] = s; // copy by value? 346 partials[i] = 0; // default: no partial 347 i++; 348 } 349 350 for (i = 0; i < subCount; i++) { 351 int nn = ustrs[i].indexOf('.'); // TODO: non-'.' abbreviations 352 if (nn > -1 && (nn + 1) != ustrs[i].length()) { 353 // is partial. 354 // is it unique? 355 int sameAs = -1; 356 for (int j = 0; j < subCount; j++) { 357 if (j == i) 358 continue; 359 if (ustrs[i].regionMatches(0, ustrs[j], 0, nn + 1)) { 360 if (partials[j] == 0) { // hasn't been processed yet 361 partials[j] = SuppressInReverse | AddToForward; 362 } else if ((partials[j] & SuppressInReverse) != 0) { 363 sameAs = j; // the other entry is already in the reverse table. 364 } 365 } 366 } 367 368 if ((sameAs == -1) && (partials[i] == 0)) { 369 StringBuilder prefix = new StringBuilder(ustrs[i].substring(0, nn + 1)); 370 // first one - add the prefix to the reverse table. 371 prefix.reverse(); 372 builder.add(prefix, PARTIAL); 373 revCount++; 374 partials[i] = SuppressInReverse | AddToForward; 375 } 376 } 377 } 378 379 for (i = 0; i < subCount; i++) { 380 if (partials[i] == 0) { 381 StringBuilder reversed = new StringBuilder(ustrs[i]).reverse(); 382 builder.add(reversed, MATCH); 383 revCount++; 384 } else { 385 // an optimization would be to only add the portion after the '.' 386 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the 387 // forward, 388 // instead of "Ph.D." since we already know the "Ph." part is a match. 389 // would need the trie to be able to hold 0-length strings, though. 390 builder2.add(ustrs[i], MATCH); // forward 391 fwdCount++; 392 } 393 } 394 395 if (revCount > 0) { 396 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST); 397 } 398 399 if (fwdCount > 0) { 400 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST); 401 } 402 return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie); 403 } 404 } 405 } 406