1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2014-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.text.CharacterIterator; 12 import java.util.HashSet; 13 import java.util.Locale; 14 15 import com.ibm.icu.impl.ICUResourceBundle.OpenType; 16 import com.ibm.icu.text.BreakIterator; 17 import com.ibm.icu.text.FilteredBreakIteratorBuilder; 18 import com.ibm.icu.text.UCharacterIterator; 19 import com.ibm.icu.util.BytesTrie; 20 import com.ibm.icu.util.CharsTrie; 21 import com.ibm.icu.util.CharsTrieBuilder; 22 import com.ibm.icu.util.ICUCloneNotSupportedException; 23 import com.ibm.icu.util.StringTrieBuilder; 24 import com.ibm.icu.util.ULocale; 25 26 /** 27 * @author tomzhang 28 */ 29 public class SimpleFilteredSentenceBreakIterator extends BreakIterator { 30 31 private BreakIterator delegate; 32 private UCharacterIterator text; // TODO(Tom): suffice to move into the local scope in next() ? 33 private CharsTrie backwardsTrie; // i.e. ".srM" for Mrs. 34 private CharsTrie forwardsPartialTrie; // Has ".a" for "a.M." 35 36 /** 37 * @param adoptBreakIterator 38 * break iterator to adopt 39 * @param forwardsPartialTrie 40 * forward & partial char trie to adopt 41 * @param backwardsTrie 42 * backward trie to adopt 43 */ SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, CharsTrie backwardsTrie)44 public SimpleFilteredSentenceBreakIterator(BreakIterator adoptBreakIterator, CharsTrie forwardsPartialTrie, 45 CharsTrie backwardsTrie) { 46 this.delegate = adoptBreakIterator; 47 this.forwardsPartialTrie = forwardsPartialTrie; 48 this.backwardsTrie = backwardsTrie; 49 } 50 51 52 /** 53 * Reset the filter from the delegate. 54 */ resetState()55 private final void resetState() { 56 text = UCharacterIterator.getInstance((CharacterIterator) delegate.getText().clone()); 57 } 58 59 /** 60 * Is there an exception at this point? 61 * 62 * @param n the location of the possible break 63 * @return 64 */ breakExceptionAt(int n)65 private final boolean breakExceptionAt(int n) { 66 // Note: the C++ version of this function is SimpleFilteredSentenceBreakIterator::breakExceptionAt() 67 68 int bestPosn = -1; 69 int bestValue = -1; 70 71 // loops while 'n' points to an exception 72 text.setIndex(n); 73 backwardsTrie.reset(); 74 int uch; 75 76 // Assume a space is following the '.' (so we handle the case: "Mr. /Brown") 77 if ((uch = text.previousCodePoint()) == ' ') { // TODO: skip a class of chars here?? 78 // TODO only do this the 1st time? 79 } else { 80 uch = text.nextCodePoint(); 81 } 82 83 while ((uch = text.previousCodePoint()) >= 0) { // more to consume backwards 84 BytesTrie.Result r = backwardsTrie.nextForCodePoint(uch); 85 if (r.hasValue()) { // remember the best match so far 86 bestPosn = text.getIndex(); 87 bestValue = backwardsTrie.getValue(); 88 } 89 if (!r.hasNext()) { 90 break; 91 } 92 } 93 backwardsTrie.reset(); // for equals() & hashCode() 94 95 if (bestPosn >= 0) { 96 if (bestValue == Builder.MATCH) { // exact match! 97 return true; // Exception here. 98 } else if (bestValue == Builder.PARTIAL && forwardsPartialTrie != null) { 99 // make sure there's a forward trie 100 // We matched the "Ph." in "Ph.D." - now we need to run everything through the forwards trie 101 // to see if it matches something going forward. 102 forwardsPartialTrie.reset(); 103 104 BytesTrie.Result rfwd = BytesTrie.Result.INTERMEDIATE_VALUE; 105 text.setIndex(bestPosn); // hope that's close .. 106 while ((uch = text.nextCodePoint()) != BreakIterator.DONE 107 && ((rfwd = forwardsPartialTrie.nextForCodePoint(uch)).hasNext())) { 108 } 109 forwardsPartialTrie.reset(); // for equals() & hashCode() 110 if (rfwd.matches()) { 111 // Exception here 112 return true; 113 } // else fall through 114 } // else fall through 115 } // else fall through 116 return false; // No exception here. 117 } 118 119 /** 120 * Given that the delegate has already given its "initial" answer, 121 * find the NEXT actual (non-suppressed) break. 122 * @param n initial position from delegate 123 * @return new break position or BreakIterator.DONE 124 */ internalNext(int n)125 private final int internalNext(int n) { 126 if (n == BreakIterator.DONE || // at end or 127 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 128 return n; 129 } 130 resetState(); 131 132 final int textLen = text.getLength(); 133 134 while (n != BreakIterator.DONE && n != textLen) { 135 // outer loop runs once per underlying break (from fDelegate). 136 // loops while 'n' points to an exception. 137 138 if (breakExceptionAt(n)) { 139 // n points to a break exception 140 n = delegate.next(); 141 } else { 142 // no exception at this spot 143 return n; 144 } 145 } 146 return n; //hit underlying DONE or break at end of text 147 } 148 149 /** 150 * Given that the delegate has already given its "initial" answer, 151 * find the PREV actual (non-suppressed) break. 152 * @param n initial position from delegate 153 * @return new break position or BreakIterator.DONE 154 */ internalPrev(int n)155 private final int internalPrev(int n) { 156 if (n == 0 || n == BreakIterator.DONE || // at end or 157 backwardsTrie == null) { // .. no backwards table loaded == no exceptions 158 return n; 159 } 160 resetState(); 161 162 while (n != BreakIterator.DONE && n != 0) { 163 // outer loop runs once per underlying break (from fDelegate). 164 // loops while 'n' points to an exception. 165 166 if (breakExceptionAt(n)) { 167 // n points to a break exception 168 n = delegate.previous(); 169 } else { 170 // no exception at this spot 171 return n; 172 } 173 } 174 return n; //hit underlying DONE or break at end of text 175 } 176 177 @Override equals(Object obj)178 public boolean equals(Object obj) { 179 if (obj == null) 180 return false; 181 if (this == obj) 182 return true; 183 if (getClass() != obj.getClass()) 184 return false; 185 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) obj; 186 // TODO(ICU-21575): CharsTrie.equals() is not defined. 187 // Should compare the underlying data, and can then stop resetting after iteration. 188 return delegate.equals(other.delegate) && text.equals(other.text) 189 && backwardsTrie.equals(other.backwardsTrie) 190 && forwardsPartialTrie.equals(other.forwardsPartialTrie); 191 } 192 193 @Override hashCode()194 public int hashCode() { 195 // TODO(ICU-21575): CharsTrie.hashCode() is not defined. 196 return (forwardsPartialTrie.hashCode() * 39) + (backwardsTrie.hashCode() * 11) 197 + delegate.hashCode(); 198 } 199 200 @Override clone()201 public Object clone() { 202 SimpleFilteredSentenceBreakIterator other = (SimpleFilteredSentenceBreakIterator) super.clone(); 203 try { 204 if (delegate != null) { 205 other.delegate = (BreakIterator) delegate.clone(); 206 } 207 if (text != null) { 208 other.text = (UCharacterIterator) text.clone(); 209 } 210 if (backwardsTrie != null) { 211 other.backwardsTrie = backwardsTrie.clone(); 212 } 213 if (forwardsPartialTrie != null) { 214 other.forwardsPartialTrie = forwardsPartialTrie.clone(); 215 } 216 } catch (CloneNotSupportedException e) { 217 throw new ICUCloneNotSupportedException(e); 218 } 219 return other; 220 } 221 222 223 @Override first()224 public int first() { 225 // Don't suppress a break opportunity at the beginning of text. 226 return delegate.first(); 227 } 228 229 @Override preceding(int offset)230 public int preceding(int offset) { 231 return internalPrev(delegate.preceding(offset)); 232 } 233 234 @Override previous()235 public int previous() { 236 return internalPrev(delegate.previous()); 237 } 238 239 @Override current()240 public int current() { 241 return delegate.current(); 242 } 243 244 @Override isBoundary(int offset)245 public boolean isBoundary(int offset) { 246 if(!delegate.isBoundary(offset)) { 247 return false; // No underlying break to suppress? 248 } 249 250 // delegate thinks there's a break… 251 if(backwardsTrie == null) { 252 return true; // no data 253 } 254 255 resetState(); 256 return !breakExceptionAt(offset); // if there's an exception: no break. 257 } 258 259 @Override next()260 public int next() { 261 return internalNext(delegate.next()); 262 } 263 264 @Override next(int n)265 public int next(int n) { 266 return internalNext(delegate.next(n)); 267 } 268 269 @Override following(int offset)270 public int following(int offset) { 271 return internalNext(delegate.following(offset)); 272 } 273 274 @Override last()275 public int last() { 276 // Don't suppress a break opportunity at the end of text. 277 return delegate.last(); 278 } 279 280 @Override getText()281 public CharacterIterator getText() { 282 return delegate.getText(); 283 } 284 285 @Override setText(CharacterIterator newText)286 public void setText(CharacterIterator newText) { 287 delegate.setText(newText); 288 } 289 290 public static class Builder extends FilteredBreakIteratorBuilder { 291 /** 292 * filter set to store all exceptions 293 */ 294 private HashSet<CharSequence> filterSet = new HashSet<>(); 295 296 static final int PARTIAL = (1 << 0); // < partial - need to run through forward trie 297 static final int MATCH = (1 << 1); // < exact match - skip this one. 298 static final int SuppressInReverse = (1 << 0); 299 static final int AddToForward = (1 << 1); 300 Builder(Locale loc)301 public Builder(Locale loc) { 302 this(ULocale.forLocale(loc)); 303 } 304 /** 305 * Create SimpleFilteredBreakIteratorBuilder using given locale 306 * @param loc the locale to get filtered iterators 307 */ Builder(ULocale loc)308 public Builder(ULocale loc) { 309 ICUResourceBundle rb = ICUResourceBundle.getBundleInstance( 310 ICUData.ICU_BRKITR_BASE_NAME, loc, OpenType.LOCALE_ROOT); 311 312 ICUResourceBundle breaks = rb.findWithFallback("exceptions/SentenceBreak"); 313 314 if (breaks != null) { 315 for (int index = 0, size = breaks.getSize(); index < size; ++index) { 316 ICUResourceBundle b = (ICUResourceBundle) breaks.get(index); 317 String br = b.getString(); 318 filterSet.add(br); 319 } 320 } 321 } 322 323 /** 324 * Create SimpleFilteredBreakIteratorBuilder with no exception 325 */ Builder()326 public Builder() { 327 } 328 329 @Override suppressBreakAfter(CharSequence str)330 public boolean suppressBreakAfter(CharSequence str) { 331 return filterSet.add(str); 332 } 333 334 @Override unsuppressBreakAfter(CharSequence str)335 public boolean unsuppressBreakAfter(CharSequence str) { 336 return filterSet.remove(str); 337 } 338 339 @Override wrapIteratorWithFilter(BreakIterator adoptBreakIterator)340 public BreakIterator wrapIteratorWithFilter(BreakIterator adoptBreakIterator) { 341 if( filterSet.isEmpty() ) { 342 // Short circuit - nothing to except. 343 return adoptBreakIterator; 344 } 345 346 CharsTrieBuilder builder = new CharsTrieBuilder(); 347 CharsTrieBuilder builder2 = new CharsTrieBuilder(); 348 349 int revCount = 0; 350 int fwdCount = 0; 351 352 int subCount = filterSet.size(); 353 CharSequence[] ustrs = new CharSequence[subCount]; 354 int[] partials = new int[subCount]; 355 356 CharsTrie backwardsTrie = null; // i.e. ".srM" for Mrs. 357 CharsTrie forwardsPartialTrie = null; // Has ".a" for "a.M." 358 359 int i = 0; 360 for (CharSequence s : filterSet) { 361 ustrs[i] = s; // copy by value? 362 partials[i] = 0; // default: no partial 363 i++; 364 } 365 366 for (i = 0; i < subCount; i++) { 367 String thisStr = ustrs[i].toString(); // TODO: don't cast to String? 368 int nn = thisStr.indexOf('.'); // TODO: non-'.' abbreviations 369 if (nn > -1 && (nn + 1) != thisStr.length()) { 370 // is partial. 371 // is it unique? 372 int sameAs = -1; 373 for (int j = 0; j < subCount; j++) { 374 if (j == i) 375 continue; 376 if (thisStr.regionMatches(0, ustrs[j].toString() /* TODO */, 0, nn + 1)) { 377 if (partials[j] == 0) { // hasn't been processed yet 378 partials[j] = SuppressInReverse | AddToForward; 379 } else if ((partials[j] & SuppressInReverse) != 0) { 380 sameAs = j; // the other entry is already in the reverse table. 381 } 382 } 383 } 384 385 if ((sameAs == -1) && (partials[i] == 0)) { 386 StringBuilder prefix = new StringBuilder(thisStr.substring(0, nn + 1)); 387 // first one - add the prefix to the reverse table. 388 prefix.reverse(); 389 builder.add(prefix, PARTIAL); 390 revCount++; 391 partials[i] = SuppressInReverse | AddToForward; 392 } 393 } 394 } 395 396 for (i = 0; i < subCount; i++) { 397 final String thisStr = ustrs[i].toString(); // TODO 398 if (partials[i] == 0) { 399 StringBuilder reversed = new StringBuilder(thisStr).reverse(); 400 builder.add(reversed, MATCH); 401 revCount++; 402 } else { 403 // an optimization would be to only add the portion after the '.' 404 // for example, for "Ph.D." we store ".hP" in the reverse table. We could just store "D." in the 405 // forward, 406 // instead of "Ph.D." since we already know the "Ph." part is a match. 407 // would need the trie to be able to hold 0-length strings, though. 408 builder2.add(thisStr, MATCH); // forward 409 fwdCount++; 410 } 411 } 412 413 if (revCount > 0) { 414 backwardsTrie = builder.build(StringTrieBuilder.Option.FAST); 415 } 416 417 if (fwdCount > 0) { 418 forwardsPartialTrie = builder2.build(StringTrieBuilder.Option.FAST); 419 } 420 return new SimpleFilteredSentenceBreakIterator(adoptBreakIterator, forwardsPartialTrie, backwardsTrie); 421 } 422 } 423 } 424