1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.text.CharacterIterator; 12 import java.text.StringCharacterIterator; 13 import java.util.Locale; 14 15 import com.ibm.icu.util.ICUException; 16 import com.ibm.icu.util.ULocale; 17 18 // Java porting note: 19 // 20 // The ICU4C implementation contains dead code in many places. 21 // While porting the ICU4C linear search implementation, this dead code 22 // was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***" 23 // are those dead code blocks, still available in ICU4C. 24 25 // The ICU4C implementation does not seem to handle UCharacterIterator pointing 26 // to a fragment of text properly. ICU4J uses CharacterIterator to navigate through 27 // the input text. We need to carefully review the code ported from ICU4C 28 // assuming the start index is 0. 29 30 // ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like 31 // CE is no longer used, except in a few places checking CELength. It looks like this 32 // is a leftover from already-disabled Boyer-Moore search code. This Java implementation 33 // preserves the code, but we should clean this up later. 34 35 /** 36 * 37 * <tt>StringSearch</tt> is a {@link SearchIterator} that provides 38 * language-sensitive text searching based on the comparison rules defined 39 * in a {@link RuleBasedCollator} object. 40 * StringSearch ensures that language eccentricity can be 41 * handled, e.g. for the German collator, characters ß and SS will be matched 42 * if case is chosen to be ignored. 43 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> 44 * "ICU Collation Design Document"</a> for more information. 45 * <p> 46 * There are 2 match options for selection:<br> 47 * Let S' be the sub-string of a text string S between the offsets start and 48 * end [start, end]. 49 * <br> 50 * A pattern string P matches a text string S at the offsets [start, end] 51 * if 52 * <pre> 53 * option 1. Some canonical equivalent of P matches some canonical equivalent 54 * of S' 55 * option 2. P matches S' and if P starts or ends with a combining mark, 56 * there exists no non-ignorable combining mark before or after S? 57 * in S respectively. 58 * </pre> 59 * Option 2. is the default. 60 * <p> 61 * This search has APIs similar to that of other text iteration mechanisms 62 * such as the break iterators in {@link BreakIterator}. Using these 63 * APIs, it is easy to scan through text looking for all occurrences of 64 * a given pattern. This search iterator allows changing of direction by 65 * calling a {@link #reset} followed by a {@link #next} or {@link #previous}. 66 * Though a direction change can occur without calling {@link #reset} first, 67 * this operation comes with some speed penalty. 68 * Match results in the forward direction will match the result matches in 69 * the backwards direction in the reverse order 70 * <p> 71 * {@link SearchIterator} provides APIs to specify the starting position 72 * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex}, 73 * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}. 74 * Since the starting position will be set as it is specified, please take note that 75 * there are some danger points at which the search may render incorrect 76 * results: 77 * <ul> 78 * <li> In the midst of a substring that requires normalization. 79 * <li> If the following match is to be found, the position should not be the 80 * second character which requires swapping with the preceding 81 * character. Vice versa, if the preceding match is to be found, the 82 * position to search from should not be the first character which 83 * requires swapping with the next character. E.g certain Thai and 84 * Lao characters require swapping. 85 * <li> If a following pattern match is to be found, any position within a 86 * contracting sequence except the first will fail. Vice versa if a 87 * preceding pattern match is to be found, an invalid starting point 88 * would be any character within a contracting sequence except the last. 89 * </ul> 90 * <p> 91 * A {@link BreakIterator} can be used if only matches at logical breaks are desired. 92 * Using a {@link BreakIterator} will only give you results that exactly matches the 93 * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will 94 * not be found in the string "\u00e9" if a character break iterator is used. 95 * <p> 96 * Options are provided to handle overlapping matches. 97 * E.g. In English, overlapping matches produces the result 0 and 2 98 * for the pattern "abab" in the text "ababab", where mutually 99 * exclusive matches only produces the result of 0. 100 * <p> 101 * Options are also provided to implement "asymmetric search" as described in 102 * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search"> 103 * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType 104 * values. 105 * <p> 106 * Though collator attributes will be taken into consideration while 107 * performing matches, there are no APIs here for setting and getting the 108 * attributes. These attributes can be set by getting the collator 109 * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}. 110 * Lastly to update <tt>StringSearch</tt> to the new collator attributes, 111 * {@link #reset} has to be called. 112 * <p> 113 * Restriction: <br> 114 * Currently there are no composite characters that consists of a 115 * character with combining class > 0 before a character with combining 116 * class == 0. However, if such a character exists in the future, 117 * <tt>StringSearch</tt> does not guarantee the results for option 1. 118 * <p> 119 * Consult the {@link SearchIterator} documentation for information on 120 * and examples of how to use instances of this class to implement text 121 * searching. 122 * <p> 123 * Note, <tt>StringSearch</tt> is not to be subclassed. 124 * </p> 125 * @see SearchIterator 126 * @see RuleBasedCollator 127 * @author Laura Werner, synwee 128 * @stable ICU 2.0 129 */ 130 // internal notes: all methods do not guarantee the correct status of the 131 // characteriterator. the caller has to maintain the original index position 132 // if necessary. methods could change the index position as it deems fit 133 public final class StringSearch extends SearchIterator { 134 135 private Pattern pattern_; 136 private RuleBasedCollator collator_; 137 138 // positions within the collation element iterator is used to determine 139 // if we are at the start of the text. 140 private CollationElementIterator textIter_; 141 private CollationPCE textProcessedIter_; 142 143 // utility collation element, used throughout program for temporary 144 // iteration. 145 private CollationElementIterator utilIter_; 146 147 private Normalizer2 nfd_; 148 149 private int strength_; 150 int ceMask_; 151 int variableTop_; 152 153 private boolean toShift_; 154 155 // *** Boyer-Moore *** 156 // private char[] canonicalPrefixAccents_; 157 // private char[] canonicalSuffixAccents_; 158 159 /** 160 * Initializes the iterator to use the language-specific rules defined in 161 * the argument collator to search for argument pattern in the argument 162 * target text. The argument <code>breakiter</code> is used to define logical matches. 163 * See super class documentation for more details on the use of the target 164 * text and {@link BreakIterator}. 165 * @param pattern text to look for. 166 * @param target target text to search for pattern. 167 * @param collator {@link RuleBasedCollator} that defines the language rules 168 * @param breakiter A {@link BreakIterator} that is used to determine the 169 * boundaries of a logical match. This argument can be null. 170 * @throws IllegalArgumentException thrown when argument target is null, 171 * or of length 0 172 * @see BreakIterator 173 * @see RuleBasedCollator 174 * @stable ICU 2.0 175 */ StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, BreakIterator breakiter)176 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, 177 BreakIterator breakiter) { 178 179 // This implementation is ported from ICU4C usearch_open() 180 181 super(target, breakiter); 182 183 // string search does not really work when numeric collation is turned on 184 if (collator.getNumericCollation()) { 185 throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch"); 186 } 187 188 collator_ = collator; 189 strength_ = collator.getStrength(); 190 ceMask_ = getMask(strength_); 191 toShift_ = collator.isAlternateHandlingShifted(); 192 variableTop_ = collator.getVariableTop(); 193 194 nfd_ = Normalizer2.getNFDInstance(); 195 196 pattern_ = new Pattern(pattern); 197 198 search_.setMatchedLength(0); 199 search_.matchedIndex_ = DONE; 200 201 utilIter_ = null; 202 textIter_ = new CollationElementIterator(target, collator); 203 204 textProcessedIter_ = null; 205 206 // This is done by super class constructor 207 /* 208 search_.isOverlap_ = false; 209 search_.isCanonicalMatch_ = false; 210 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 211 search_.isForwardSearching_ = true; 212 search_.reset_ = true; 213 */ 214 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 215 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 216 search_.internalBreakIter_.setText((CharacterIterator)target.clone()); // We need to create a clone 217 218 initialize(); 219 } 220 221 /** 222 * Initializes the iterator to use the language-specific rules defined in 223 * the argument collator to search for argument pattern in the argument 224 * target text. No {@link BreakIterator}s are set to test for logical matches. 225 * @param pattern text to look for. 226 * @param target target text to search for pattern. 227 * @param collator {@link RuleBasedCollator} that defines the language rules 228 * @throws IllegalArgumentException thrown when argument target is null, 229 * or of length 0 230 * @see RuleBasedCollator 231 * @stable ICU 2.0 232 */ StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator)233 public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) { 234 this(pattern, target, collator, null); 235 } 236 237 /** 238 * Initializes the iterator to use the language-specific rules and 239 * break iterator rules defined in the argument locale to search for 240 * argument pattern in the argument target text. 241 * @param pattern text to look for. 242 * @param target target text to search for pattern. 243 * @param locale locale to use for language and break iterator rules 244 * @throws IllegalArgumentException thrown when argument target is null, 245 * or of length 0. ClassCastException thrown if the collator for 246 * the specified locale is not a RuleBasedCollator. 247 * @stable ICU 2.0 248 */ StringSearch(String pattern, CharacterIterator target, Locale locale)249 public StringSearch(String pattern, CharacterIterator target, Locale locale) { 250 this(pattern, target, ULocale.forLocale(locale)); 251 } 252 253 /** 254 * Initializes the iterator to use the language-specific rules and 255 * break iterator rules defined in the argument locale to search for 256 * argument pattern in the argument target text. 257 * See super class documentation for more details on the use of the target 258 * text and {@link BreakIterator}. 259 * @param pattern text to look for. 260 * @param target target text to search for pattern. 261 * @param locale locale to use for language and break iterator rules 262 * @throws IllegalArgumentException thrown when argument target is null, 263 * or of length 0. ClassCastException thrown if the collator for 264 * the specified locale is not a RuleBasedCollator. 265 * @see BreakIterator 266 * @see RuleBasedCollator 267 * @see SearchIterator 268 * @stable ICU 3.2 269 */ StringSearch(String pattern, CharacterIterator target, ULocale locale)270 public StringSearch(String pattern, CharacterIterator target, ULocale locale) { 271 this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null); 272 } 273 274 /** 275 * Initializes the iterator to use the language-specific rules and 276 * break iterator rules defined in the default locale to search for 277 * argument pattern in the argument target text. 278 * @param pattern text to look for. 279 * @param target target text to search for pattern. 280 * @throws IllegalArgumentException thrown when argument target is null, 281 * or of length 0. ClassCastException thrown if the collator for 282 * the default locale is not a RuleBasedCollator. 283 * @stable ICU 2.0 284 */ StringSearch(String pattern, String target)285 public StringSearch(String pattern, String target) { 286 this(pattern, new StringCharacterIterator(target), 287 (RuleBasedCollator) Collator.getInstance(), null); 288 } 289 290 /** 291 * Gets the {@link RuleBasedCollator} used for the language rules. 292 * <p> 293 * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any 294 * changes to the {@link RuleBasedCollator} result should follow with a call to 295 * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct 296 * search behavior. 297 * </p> 298 * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt> 299 * @see RuleBasedCollator 300 * @see #setCollator 301 * @stable ICU 2.0 302 */ getCollator()303 public RuleBasedCollator getCollator() { 304 return collator_; 305 } 306 307 /** 308 * Sets the {@link RuleBasedCollator} to be used for language-specific searching. 309 * <p> 310 * The iterator's position will not be changed by this method. 311 * @param collator to use for this <tt>StringSearch</tt> 312 * @throws IllegalArgumentException thrown when collator is null 313 * @see #getCollator 314 * @stable ICU 2.0 315 */ setCollator(RuleBasedCollator collator)316 public void setCollator(RuleBasedCollator collator) { 317 if (collator == null) { 318 throw new IllegalArgumentException("Collator can not be null"); 319 } 320 collator_ = collator; 321 ceMask_ = getMask(collator_.getStrength()); 322 323 ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE); 324 search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale); 325 search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone()); // We need to create a clone 326 327 toShift_ = collator.isAlternateHandlingShifted(); 328 variableTop_ = collator.getVariableTop(); 329 textIter_ = new CollationElementIterator(pattern_.text_, collator); 330 utilIter_ = new CollationElementIterator(pattern_.text_, collator); 331 332 // initialize() _after_ setting the iterators for the new collator. 333 initialize(); 334 } 335 336 /** 337 * Returns the pattern for which <tt>StringSearch</tt> is searching for. 338 * @return the pattern searched for 339 * @stable ICU 2.0 340 */ getPattern()341 public String getPattern() { 342 return pattern_.text_; 343 } 344 345 /** 346 * Set the pattern to search for. 347 * The iterator's position will not be changed by this method. 348 * @param pattern for searching 349 * @see #getPattern 350 * @exception IllegalArgumentException thrown if pattern is null or of 351 * length 0 352 * @stable ICU 2.0 353 */ setPattern(String pattern)354 public void setPattern(String pattern) { 355 if (pattern == null || pattern.length() <= 0) { 356 throw new IllegalArgumentException( 357 "Pattern to search for can not be null or of length 0"); 358 } 359 pattern_.text_ = pattern; 360 initialize(); 361 } 362 363 /** 364 * Determines whether canonical matches (option 1, as described in the 365 * class documentation) is set. 366 * See setCanonical(boolean) for more information. 367 * @see #setCanonical 368 * @return true if canonical matches is set, false otherwise 369 * @stable ICU 2.8 370 */ 371 //TODO: hoist this to SearchIterator isCanonical()372 public boolean isCanonical() { 373 return search_.isCanonicalMatch_; 374 } 375 376 /** 377 * Set the canonical match mode. See class documentation for details. 378 * The default setting for this property is false. 379 * @param allowCanonical flag indicator if canonical matches are allowed 380 * @see #isCanonical 381 * @stable ICU 2.8 382 */ 383 //TODO: hoist this to SearchIterator setCanonical(boolean allowCanonical)384 public void setCanonical(boolean allowCanonical) { 385 search_.isCanonicalMatch_ = allowCanonical; 386 } 387 388 /** 389 * {@inheritDoc} 390 * @stable ICU 2.8 391 */ 392 @Override setTarget(CharacterIterator text)393 public void setTarget(CharacterIterator text) { 394 super.setTarget(text); 395 textIter_.setText(text); 396 } 397 398 /** 399 * {@inheritDoc} 400 * @stable ICU 2.8 401 */ 402 @Override getIndex()403 public int getIndex() { 404 int result = textIter_.getOffset(); 405 if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) { 406 return DONE; 407 } 408 return result; 409 } 410 411 /** 412 * {@inheritDoc} 413 * @stable ICU 2.8 414 */ 415 @Override setIndex(int position)416 public void setIndex(int position) { 417 // Java porting note: This method is equivalent to setOffset() in ICU4C. 418 // ICU4C SearchIterator::setOffset() is a pure virtual method, while 419 // ICU4J SearchIterator.setIndex() is not abstract method. 420 421 super.setIndex(position); 422 textIter_.setOffset(position); 423 } 424 425 /** 426 * {@inheritDoc} 427 * @stable ICU 2.8 428 */ 429 @Override reset()430 public void reset() { 431 // reset is setting the attributes that are already in 432 // string search, hence all attributes in the collator should 433 // be retrieved without any problems 434 435 boolean sameCollAttribute = true; 436 int ceMask; 437 boolean shift; 438 int varTop; 439 440 // **** hack to deal w/ how processed CEs encode quaternary **** 441 int newStrength = collator_.getStrength(); 442 if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY) 443 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) { 444 sameCollAttribute = false; 445 } 446 447 strength_ = collator_.getStrength(); 448 ceMask = getMask(strength_); 449 if (ceMask_ != ceMask) { 450 ceMask_ = ceMask; 451 sameCollAttribute = false; 452 } 453 454 shift = collator_.isAlternateHandlingShifted(); 455 if (toShift_ != shift) { 456 toShift_ = shift; 457 sameCollAttribute = false; 458 } 459 460 varTop = collator_.getVariableTop(); 461 if (variableTop_ != varTop) { 462 variableTop_ = varTop; 463 sameCollAttribute = false; 464 } 465 466 if (!sameCollAttribute) { 467 initialize(); 468 } 469 470 textIter_.setText(search_.text()); 471 472 search_.setMatchedLength(0); 473 search_.matchedIndex_ = DONE; 474 search_.isOverlap_ = false; 475 search_.isCanonicalMatch_ = false; 476 search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON; 477 search_.isForwardSearching_ = true; 478 search_.reset_ = true; 479 } 480 481 /** 482 * {@inheritDoc} 483 * @stable ICU 2.8 484 */ 485 @Override handleNext(int position)486 protected int handleNext(int position) { 487 if (pattern_.CELength_ == 0) { 488 search_.matchedIndex_ = search_.matchedIndex_ == DONE ? 489 getIndex() : search_.matchedIndex_ + 1; 490 search_.setMatchedLength(0); 491 textIter_.setOffset(search_.matchedIndex_); 492 if (search_.matchedIndex_ == search_.endIndex()) { 493 search_.matchedIndex_ = DONE; 494 } 495 } else { 496 if (search_.matchedLength() <= 0) { 497 // the flipping direction issue has already been handled 498 // in next() 499 // for boundary check purposes. this will ensure that the 500 // next match will not preceed the current offset 501 // note search_.matchedIndex_ will always be set to something 502 // in the code 503 search_.matchedIndex_ = position - 1; 504 } 505 506 textIter_.setOffset(position); 507 508 // ICU4C comment: 509 // if strsrch_->breakIter is always the same as m_breakiterator_ 510 // then we don't need to check the match boundaries here because 511 // usearch_handleNextXXX will already have done it. 512 if (search_.isCanonicalMatch_) { 513 // *could* actually use exact here 'cause no extra accents allowed... 514 handleNextCanonical(); 515 } else { 516 handleNextExact(); 517 } 518 519 if (search_.matchedIndex_ == DONE) { 520 textIter_.setOffset(search_.endIndex()); 521 } else { 522 textIter_.setOffset(search_.matchedIndex_); 523 } 524 525 return search_.matchedIndex_; 526 } 527 528 return DONE; 529 } 530 531 /** 532 * {@inheritDoc} 533 * @stable ICU 2.8 534 */ 535 @Override handlePrevious(int position)536 protected int handlePrevious(int position) { 537 if (pattern_.CELength_ == 0) { 538 search_.matchedIndex_ = 539 search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_; 540 if (search_.matchedIndex_ == search_.beginIndex()) { 541 setMatchNotFound(); 542 } else { 543 search_.matchedIndex_--; 544 textIter_.setOffset(search_.matchedIndex_); 545 search_.setMatchedLength(0); 546 } 547 } else { 548 textIter_.setOffset(position); 549 550 if (search_.isCanonicalMatch_) { 551 // *could* use exact match here since extra accents *not* allowed! 552 handlePreviousCanonical(); 553 } else { 554 handlePreviousExact(); 555 } 556 } 557 558 return search_.matchedIndex_; 559 } 560 561 // ------------------ Internal implementation code --------------------------- 562 563 private static final int INITIAL_ARRAY_SIZE_ = 256; 564 565 // *** Boyer-Moore *** 566 // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl; 567 // private static final int LAST_BYTE_MASK_ = 0xff; 568 // private static final int SECOND_LAST_BYTE_SHIFT_ = 8; 569 570 private static final int PRIMARYORDERMASK = 0xffff0000; 571 private static final int SECONDARYORDERMASK = 0x0000ff00; 572 private static final int TERTIARYORDERMASK = 0x000000ff; 573 574 /** 575 * Getting the mask for collation strength 576 * @param strength collation strength 577 * @return collation element mask 578 */ getMask(int strength)579 private static int getMask(int strength) { 580 switch (strength) { 581 case Collator.PRIMARY: 582 return PRIMARYORDERMASK; 583 case Collator.SECONDARY: 584 return SECONDARYORDERMASK | PRIMARYORDERMASK; 585 default: 586 return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK; 587 } 588 } 589 590 591 // *** Boyer-Moore *** 592 /* 593 private final char getFCD(String str, int offset) { 594 char ch = str.charAt(offset); 595 if (ch < 0x180) { 596 return (char) nfcImpl_.getFCD16FromBelow180(ch); 597 } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) { 598 if (!Character.isHighSurrogate(ch)) { 599 return (char) nfcImpl_.getFCD16FromNormData(ch); 600 } else { 601 char c2; 602 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) { 603 return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2)); 604 } 605 } 606 } 607 return 0; 608 } 609 610 private final char getFCD(int c) { 611 return (char)nfcImpl_.getFCD16(c); 612 } 613 */ 614 615 /** 616 * Getting the modified collation elements taking into account the collation 617 * attributes. 618 * 619 * @param sourcece 620 * @return the modified collation element 621 */ getCE(int sourcece)622 private int getCE(int sourcece) { 623 // note for tertiary we can't use the collator->tertiaryMask, that 624 // is a preprocessed mask that takes into account case options. since 625 // we are only concerned with exact matches, we don't need that. 626 sourcece &= ceMask_; 627 628 if (toShift_) { 629 // alternate handling here, since only the 16 most significant digits 630 // is only used, we can safely do a compare without masking 631 // if the ce is a variable, we mask and get only the primary values 632 // no shifting to quartenary is required since all primary values 633 // less than variabletop will need to be masked off anyway. 634 if (variableTop_ > sourcece) { 635 if (strength_ >= Collator.QUATERNARY) { 636 sourcece &= PRIMARYORDERMASK; 637 } else { 638 sourcece = CollationElementIterator.IGNORABLE; 639 } 640 } 641 } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) { 642 sourcece = 0xFFFF; 643 } 644 645 return sourcece; 646 } 647 648 /** 649 * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp. 650 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 651 * implement this in Pattern class. 652 * 653 * @param destination target array 654 * @param offset destination offset to add value 655 * @param destinationlength target array size 656 * @param value to be added 657 * @param increments incremental size expected 658 * @return new destination array, destination if there was no new allocation 659 */ addToIntArray(int[] destination, int offset, int destinationlength, int value, int increments)660 private static int[] addToIntArray(int[] destination, int offset, int destinationlength, 661 int value, int increments) { 662 int newlength = destinationlength; 663 if (offset + 1 == newlength) { 664 newlength += increments; 665 int temp[] = new int[newlength]; 666 System.arraycopy(destination, 0, temp, 0, offset); 667 destination = temp; 668 } 669 destination[offset] = value; 670 return destination; 671 } 672 673 /** 674 * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp. 675 * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should 676 * implement this in Pattern class. 677 * 678 * @param destination target array 679 * @param offset destination offset to add value 680 * @param destinationlength target array size 681 * @param value to be added 682 * @param increments incremental size expected 683 * @return new destination array, destination if there was no new allocation 684 */ addToLongArray(long[] destination, int offset, int destinationlength, long value, int increments)685 private static long[] addToLongArray(long[] destination, int offset, int destinationlength, 686 long value, int increments) { 687 int newlength = destinationlength; 688 if (offset + 1 == newlength) { 689 newlength += increments; 690 long temp[] = new long[newlength]; 691 System.arraycopy(destination, 0, temp, 0, offset); 692 destination = temp; 693 } 694 destination[offset] = value; 695 return destination; 696 } 697 698 /** 699 * Initializing the ce table for a pattern. 700 * Stores non-ignorable collation keys. 701 * Table size will be estimated by the size of the pattern text. Table 702 * expansion will be perform as we go along. Adding 1 to ensure that the table 703 * size definitely increases. 704 * @return total number of expansions 705 */ 706 // TODO: We probably do not need Pattern CE table. initializePatternCETable()707 private int initializePatternCETable() { 708 int[] cetable = new int[INITIAL_ARRAY_SIZE_]; 709 int cetablesize = cetable.length; 710 int patternlength = pattern_.text_.length(); 711 CollationElementIterator coleiter = utilIter_; 712 713 if (coleiter == null) { 714 coleiter = new CollationElementIterator(pattern_.text_, collator_); 715 utilIter_ = coleiter; 716 } else { 717 coleiter.setText(pattern_.text_); 718 } 719 720 int offset = 0; 721 int result = 0; 722 int ce; 723 724 while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) { 725 int newce = getCE(ce); 726 if (newce != CollationElementIterator.IGNORABLE /* 0 */) { 727 int[] temp = addToIntArray(cetable, offset, cetablesize, newce, 728 patternlength - coleiter.getOffset() + 1); 729 offset++; 730 cetable = temp; 731 } 732 result += (coleiter.getMaxExpansion(ce) - 1); 733 } 734 735 cetable[offset] = 0; 736 pattern_.CE_ = cetable; 737 pattern_.CELength_ = offset; 738 739 return result; 740 } 741 742 /** 743 * Initializing the pce table for a pattern. 744 * Stores non-ignorable collation keys. 745 * Table size will be estimated by the size of the pattern text. Table 746 * expansion will be perform as we go along. Adding 1 to ensure that the table 747 * size definitely increases. 748 * @return total number of expansions 749 */ initializePatternPCETable()750 private int initializePatternPCETable() { 751 long[] pcetable = new long[INITIAL_ARRAY_SIZE_]; 752 int pcetablesize = pcetable.length; 753 int patternlength = pattern_.text_.length(); 754 CollationElementIterator coleiter = utilIter_; 755 756 if (coleiter == null) { 757 coleiter = new CollationElementIterator(pattern_.text_, collator_); 758 utilIter_ = coleiter; 759 } else { 760 coleiter.setText(pattern_.text_); 761 } 762 763 int offset = 0; 764 int result = 0; 765 long pce; 766 767 CollationPCE iter = new CollationPCE(coleiter); 768 769 // ** Should processed CEs be signed or unsigned? 770 // ** (the rest of the code in this file seems to play fast-and-loose with 771 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 772 while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) { 773 long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1); 774 offset++; 775 pcetable = temp; 776 } 777 778 pcetable[offset] = 0; 779 pattern_.PCE_ = pcetable; 780 pattern_.PCELength_ = offset; 781 782 return result; 783 } 784 785 // TODO: This method only triggers initializePatternCETable(), which is probably no 786 // longer needed. initializePattern()787 private int initializePattern() { 788 // Since the strength is primary, accents are ignored in the pattern. 789 790 // *** Boyer-Moore *** 791 /* 792 if (strength_ == Collator.PRIMARY) { 793 pattern_.hasPrefixAccents_ = false; 794 pattern_.hasSuffixAccents_ = false; 795 } else { 796 pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0; 797 pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0; 798 } 799 */ 800 801 pattern_.PCE_ = null; 802 803 // since intializePattern is an internal method status is a success. 804 return initializePatternCETable(); 805 } 806 807 // *** Boyer-Moore *** 808 /* 809 private final void setShiftTable(char shift[], 810 char backshift[], 811 int cetable[], int cesize, 812 int expansionsize, 813 int defaultforward, 814 int defaultbackward) { 815 // No implementation 816 } 817 */ 818 819 // TODO: This method only triggers initializePattern(), which is probably no 820 // longer needed. initialize()821 private void initialize() { 822 /* int expandlength = */ initializePattern(); 823 824 // *** Boyer-Moore *** 825 /* 826 if (pattern_.CELength_ > 0) { 827 int cesize = pattern_.CELength_; 828 int minlength = cesize > expandlength ? cesize - expandlength : 1; 829 pattern_.defaultShiftSize_ = minlength; 830 setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize, 831 expandlength, minlength, minlength); 832 return; 833 } 834 return pattern_.defaultShiftSize_; 835 */ 836 } 837 838 /** 839 * @internal 840 * @deprecated This API is ICU internal only. 841 */ 842 @Deprecated setMatchNotFound()843 protected void setMatchNotFound() { 844 super.setMatchNotFound(); 845 // SearchIterator#setMatchNotFound() does following: 846 // search_.matchedIndex_ = DONE; 847 // search_.setMatchedLength(0); 848 if (search_.isForwardSearching_) { 849 textIter_.setOffset(search_.text().getEndIndex()); 850 } else { 851 textIter_.setOffset(0); 852 } 853 } 854 855 /** 856 * Checks if the offset runs out of the text string range 857 * @param textstart offset of the first character in the range 858 * @param textlimit limit offset of the text string range 859 * @param offset to test 860 * @return true if offset is out of bounds, false otherwise 861 */ isOutOfBounds(int textstart, int textlimit, int offset)862 private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) { 863 return offset < textstart || offset > textlimit; 864 } 865 866 /** 867 * Checks for identical match 868 * @param start offset of possible match 869 * @param end offset of possible match 870 * @return TRUE if identical match is found 871 */ checkIdentical(int start, int end)872 private boolean checkIdentical(int start, int end) { 873 if (strength_ != Collator.IDENTICAL) { 874 return true; 875 } 876 // Note: We could use Normalizer::compare() or similar, but for short strings 877 // which may not be in FCD it might be faster to just NFD them. 878 String textstr = getString(targetText, start, end - start); 879 if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) { 880 textstr = Normalizer.decompose(textstr, false); 881 } 882 String patternstr = pattern_.text_; 883 if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) { 884 patternstr = Normalizer.decompose(patternstr, false); 885 } 886 return textstr.equals(patternstr); 887 } 888 initTextProcessedIter()889 private boolean initTextProcessedIter() { 890 if (textProcessedIter_ == null) { 891 textProcessedIter_ = new CollationPCE(textIter_); 892 } else { 893 textProcessedIter_.init(textIter_); 894 } 895 return true; 896 } 897 898 /* 899 * Find the next break boundary after startIndex. If the UStringSearch object 900 * has an external break iterator, use that. Otherwise use the internal character 901 * break iterator. 902 */ nextBoundaryAfter(int startIndex)903 private int nextBoundaryAfter(int startIndex) { 904 BreakIterator breakiterator = search_.breakIter(); 905 906 if (breakiterator == null) { 907 breakiterator = search_.internalBreakIter_; 908 } 909 910 if (breakiterator != null) { 911 return breakiterator.following(startIndex); 912 } 913 914 return startIndex; 915 } 916 917 /* 918 * Returns TRUE if index is on a break boundary. If the UStringSearch 919 * has an external break iterator, test using that, otherwise test 920 * using the internal character break iterator. 921 */ isBreakBoundary(int index)922 private boolean isBreakBoundary(int index) { 923 BreakIterator breakiterator = search_.breakIter(); 924 925 if (breakiterator == null) { 926 breakiterator = search_.internalBreakIter_; 927 } 928 929 return (breakiterator != null && breakiterator.isBoundary(index)); 930 } 931 932 933 // Java porting note: Followings are corresponding to UCompareCEsResult enum 934 private static final int CE_MATCH = -1; 935 private static final int CE_NO_MATCH = 0; 936 private static final int CE_SKIP_TARG = 1; 937 private static final int CE_SKIP_PATN = 2; 938 939 private static int CE_LEVEL2_BASE = 0x00000005; 940 private static int CE_LEVEL3_BASE = 0x00050000; 941 compareCE64s(long targCE, long patCE, ElementComparisonType compareType)942 private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) { 943 if (targCE == patCE) { 944 return CE_MATCH; 945 } 946 if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 947 return CE_NO_MATCH; 948 } 949 950 long targCEshifted = targCE >>> 32; 951 long patCEshifted = patCE >>> 32; 952 long mask; 953 954 mask = 0xFFFF0000L; 955 int targLev1 = (int)(targCEshifted & mask); 956 int patLev1 = (int)(patCEshifted & mask); 957 if (targLev1 != patLev1) { 958 if (targLev1 == 0) { 959 return CE_SKIP_TARG; 960 } 961 if (patLev1 == 0 962 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 963 return CE_SKIP_PATN; 964 } 965 return CE_NO_MATCH; 966 } 967 968 mask = 0x0000FFFFL; 969 int targLev2 = (int)(targCEshifted & mask); 970 int patLev2 = (int)(patCEshifted & mask); 971 if (targLev2 != patLev2) { 972 if (targLev2 == 0) { 973 return CE_SKIP_TARG; 974 } 975 if (patLev2 == 0 976 && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) { 977 return CE_SKIP_PATN; 978 } 979 return (patLev2 == CE_LEVEL2_BASE || 980 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 981 targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH; 982 } 983 984 mask = 0xFFFF0000L; 985 int targLev3 = (int)(targCE & mask); 986 int patLev3 = (int)(patCE & mask); 987 if (targLev3 != patLev3) { 988 return (patLev3 == CE_LEVEL3_BASE || 989 (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD && 990 targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH; 991 } 992 993 return CE_MATCH; 994 } 995 996 /** 997 * An object used for receiving matched index in search() and 998 * searchBackwards(). 999 */ 1000 private static class Match { 1001 int start_ = -1; 1002 int limit_ = -1; 1003 } 1004 search(int startIdx, Match m)1005 private boolean search(int startIdx, Match m) { 1006 // Input parameter sanity check. 1007 if (pattern_.CELength_ == 0 1008 || startIdx < search_.beginIndex() 1009 || startIdx > search_.endIndex()) { 1010 throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " + 1011 search_.beginIndex() + " and " + search_.endIndex()); 1012 } 1013 1014 if (pattern_.PCE_ == null) { 1015 initializePatternPCETable(); 1016 } 1017 1018 textIter_.setOffset(startIdx); 1019 CEBuffer ceb = new CEBuffer(this); 1020 1021 int targetIx = 0; 1022 CEI targetCEI = null; 1023 int patIx; 1024 boolean found; 1025 1026 int mStart = -1; 1027 int mLimit = -1; 1028 int minLimit; 1029 int maxLimit; 1030 1031 // Outer loop moves over match starting positions in the 1032 // target CE space. 1033 // Here we see the target as a sequence of collation elements, resulting from the following: 1034 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 1035 // (for example, digraphs such as IJ may be broken into two characters). 1036 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 1037 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 1038 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 1039 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 1040 // then the CE is deleted, so the following code sees only CEs that are relevant. 1041 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 1042 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 1043 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 1044 for (targetIx = 0; ; targetIx++) { 1045 found = true; 1046 // Inner loop checks for a match beginning at each 1047 // position from the outer loop. 1048 int targetIxOffset = 0; 1049 long patCE = 0; 1050 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 1051 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1052 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 1053 CEI firstCEI = ceb.get(targetIx); 1054 if (firstCEI == null) { 1055 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null."); 1056 } 1057 1058 for (patIx = 0; patIx < pattern_.PCELength_; patIx++) { 1059 patCE = pattern_.PCE_[patIx]; 1060 targetCEI = ceb.get(targetIx + patIx + targetIxOffset); 1061 // Compare CE from target string with CE from the pattern. 1062 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 1063 // which will fail the compare, below. 1064 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1065 if (ceMatch == CE_NO_MATCH) { 1066 found = false; 1067 break; 1068 } else if (ceMatch > CE_NO_MATCH) { 1069 if (ceMatch == CE_SKIP_TARG) { 1070 // redo with same patCE, next targCE 1071 patIx--; 1072 targetIxOffset++; 1073 } else { // ceMatch == CE_SKIP_PATN 1074 // redo with same targCE, next patCE 1075 targetIxOffset--; 1076 } 1077 } 1078 } 1079 targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far 1080 1081 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1082 // No match at this targetIx. Try again at the next. 1083 continue; 1084 } 1085 1086 if (!found) { 1087 // No match at all, we have run off the end of the target text. 1088 break; 1089 } 1090 1091 // We have found a match in CE space. 1092 // Now determine the bounds in string index space. 1093 // There still is a chance of match failure if the CE range not correspond to 1094 // an acceptable character range. 1095 // 1096 CEI lastCEI = ceb.get(targetIx + targetIxOffset -1); 1097 1098 mStart = firstCEI.lowIndex_; 1099 minLimit = lastCEI.lowIndex_; 1100 1101 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1102 // extended to the end of input, and the match is good. 1103 1104 // Look at the high and low indices of the CE following the match. If 1105 // they are the same it means one of two things: 1106 // 1. The match extended to the last CE from the target text, which is OK, or 1107 // 2. The last CE that was part of the match is in an expansion that extends 1108 // to the first CE after the match. In this case, we reject the match. 1109 CEI nextCEI = null; 1110 if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1111 nextCEI = ceb.get(targetIx + targetIxOffset); 1112 maxLimit = nextCEI.lowIndex_; 1113 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1114 found = false; 1115 } 1116 } else { 1117 for (;; ++targetIxOffset) { 1118 nextCEI = ceb.get(targetIx + targetIxOffset); 1119 maxLimit = nextCEI.lowIndex_; 1120 // If we are at the end of the target too, match succeeds 1121 if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) { 1122 break; 1123 } 1124 // As long as the next CE has primary weight of 0, 1125 // it is part of the last target element matched by the pattern; 1126 // make sure it can be part of a match with the last patCE 1127 if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) { 1128 int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_); 1129 if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) { 1130 found = false; 1131 break; 1132 } 1133 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 1134 // target element, but it has non-zero primary weight => match fails 1135 } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) { 1136 found = false; 1137 break; 1138 // Else the target CE is not part of an expansion of the last matched element, match succeeds 1139 } else { 1140 break; 1141 } 1142 } 1143 } 1144 1145 // Check for the start of the match being within a combining sequence. 1146 // This can happen if the pattern itself begins with a combining char, and 1147 // the match found combining marks in the target text that were attached 1148 // to something else. 1149 // This type of match should be rejected for not completely consuming a 1150 // combining sequence. 1151 if (!isBreakBoundary(mStart)) { 1152 found = false; 1153 } 1154 1155 // Check for the start of the match being within an Collation Element Expansion, 1156 // meaning that the first char of the match is only partially matched. 1157 // With expansions, the first CE will report the index of the source 1158 // character, and all subsequent (expansions) CEs will report the source index of the 1159 // _following_ character. 1160 int secondIx = firstCEI.highIndex_; 1161 if (mStart == secondIx) { 1162 found = false; 1163 } 1164 1165 // Allow matches to end in the middle of a grapheme cluster if the following 1166 // conditions are met; this is needed to make prefix search work properly in 1167 // Indic, see #11750 1168 // * the default breakIter is being used 1169 // * the next collation element after this combining sequence 1170 // - has non-zero primary weight 1171 // - corresponds to a separate character following the one at end of the current match 1172 // (the second of these conditions, and perhaps both, may be redundant given the 1173 // subsequent check for normalization boundary; however they are likely much faster 1174 // tests in any case) 1175 // * the match limit is a normalization boundary 1176 boolean allowMidclusterMatch = 1177 breakIterator == null && 1178 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 1179 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1180 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1181 nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1182 1183 // If those conditions are met, then: 1184 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1185 // the match limit may be backed off to a previous break boundary. This handles 1186 // cases in which mLimit includes target characters that are ignorable with current 1187 // settings (such as space) and which extend beyond the pattern match. 1188 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1189 // * do NOT require that match limit be on a breakIter boundary 1190 1191 // Advance the match end position to the first acceptable match boundary. 1192 // This advances the index over any combining characters. 1193 mLimit = maxLimit; 1194 if (minLimit < maxLimit) { 1195 // When the last CE's low index is same with its high index, the CE is likely 1196 // a part of expansion. In this case, the index is located just after the 1197 // character corresponding to the CEs compared above. If the index is right 1198 // at the break boundary, move the position to the next boundary will result 1199 // incorrect match length when there are ignorable characters exist between 1200 // the position and the next character produces CE(s). See ticket#8482. 1201 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) { 1202 mLimit = minLimit; 1203 } else { 1204 int nba = nextBoundaryAfter(minLimit); 1205 // Note that we can have nba < maxLimit && nba >= minLImit, in which 1206 // case we want to set mLimit to nba regardless of allowMidclusterMatch 1207 // (i.e. we back off mLimit to the previous breakIterator boundary). 1208 if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 1209 mLimit = nba; 1210 } 1211 } 1212 } 1213 1214 if (!allowMidclusterMatch) { 1215 // If advancing to the end of a combining sequence in character indexing space 1216 // advanced us beyond the end of the match in CE space, reject this match. 1217 if (mLimit > maxLimit) { 1218 found = false; 1219 } 1220 1221 if (!isBreakBoundary(mLimit)) { 1222 found = false; 1223 } 1224 } 1225 1226 if (!checkIdentical(mStart, mLimit)) { 1227 found = false; 1228 } 1229 1230 if (found) { 1231 break; 1232 } 1233 } 1234 1235 // All Done. Store back the match bounds to the caller. 1236 // 1237 if (found == false) { 1238 mLimit = -1; 1239 mStart = -1; 1240 } 1241 1242 if (m != null) { 1243 m.start_ = mStart; 1244 m.limit_ = mLimit; 1245 } 1246 1247 return found; 1248 } 1249 codePointAt(CharacterIterator iter, int index)1250 private static int codePointAt(CharacterIterator iter, int index) { 1251 int currentIterIndex = iter.getIndex(); 1252 char codeUnit = iter.setIndex(index); 1253 int cp = codeUnit; 1254 if (Character.isHighSurrogate(codeUnit)) { 1255 char nextUnit = iter.next(); 1256 if (Character.isLowSurrogate(nextUnit)) { 1257 cp = Character.toCodePoint(codeUnit, nextUnit); 1258 } 1259 } 1260 iter.setIndex(currentIterIndex); // restore iter position 1261 return cp; 1262 } 1263 codePointBefore(CharacterIterator iter, int index)1264 private static int codePointBefore(CharacterIterator iter, int index) { 1265 int currentIterIndex = iter.getIndex(); 1266 iter.setIndex(index); 1267 char codeUnit = iter.previous(); 1268 int cp = codeUnit; 1269 if (Character.isLowSurrogate(codeUnit)) { 1270 char prevUnit = iter.previous(); 1271 if (Character.isHighSurrogate(prevUnit)) { 1272 cp = Character.toCodePoint(prevUnit, codeUnit); 1273 } 1274 } 1275 iter.setIndex(currentIterIndex); // restore iter position 1276 return cp; 1277 } 1278 searchBackwards(int startIdx, Match m)1279 private boolean searchBackwards(int startIdx, Match m) { 1280 //ICU4C_TODO comment: reject search patterns beginning with a combining char. 1281 1282 // Input parameter sanity check. 1283 if (pattern_.CELength_ == 0 1284 || startIdx < search_.beginIndex() 1285 || startIdx > search_.endIndex()) { 1286 throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " + 1287 search_.beginIndex() + " and " + search_.endIndex()); 1288 } 1289 1290 if (pattern_.PCE_ == null) { 1291 initializePatternPCETable(); 1292 } 1293 1294 CEBuffer ceb = new CEBuffer(this); 1295 int targetIx = 0; 1296 1297 /* 1298 * Pre-load the buffer with the CE's for the grapheme 1299 * after our starting position so that we're sure that 1300 * we can look at the CE following the match when we 1301 * check the match boundaries. 1302 * 1303 * This will also pre-fetch the first CE that we'll 1304 * consider for the match. 1305 */ 1306 if (startIdx < search_.endIndex()) { 1307 BreakIterator bi = search_.internalBreakIter_; 1308 int next = bi.following(startIdx); 1309 1310 textIter_.setOffset(next); 1311 1312 for (targetIx = 0; ; targetIx++) { 1313 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) { 1314 break; 1315 } 1316 } 1317 } else { 1318 textIter_.setOffset(startIdx); 1319 } 1320 1321 CEI targetCEI = null; 1322 int patIx; 1323 boolean found; 1324 1325 int limitIx = targetIx; 1326 int mStart = -1; 1327 int mLimit = -1; 1328 int minLimit; 1329 int maxLimit; 1330 1331 // Outer loop moves over match starting positions in the 1332 // target CE space. 1333 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 1334 // But patIx is 0 at the beginning of the pattern and increases toward the end. 1335 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 1336 // and the beginning of the base text. 1337 for (targetIx = limitIx; ; targetIx++) { 1338 found = true; 1339 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 1340 // (compared to the last CE fetched for the previous targetIx value) as we need to go 1341 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 1342 CEI lastCEI = ceb.getPrevious(targetIx); 1343 if (lastCEI == null) { 1344 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null."); 1345 } 1346 // Inner loop checks for a match beginning at each 1347 // position from the outer loop. 1348 int targetIxOffset = 0; 1349 for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) { 1350 long patCE = pattern_.PCE_[patIx]; 1351 1352 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset); 1353 // Compare CE from target string with CE from the pattern. 1354 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 1355 // which will fail the compare, below. 1356 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_); 1357 if (ceMatch == CE_NO_MATCH) { 1358 found = false; 1359 break; 1360 } else if (ceMatch > CE_NO_MATCH) { 1361 if (ceMatch == CE_SKIP_TARG) { 1362 // redo with same patCE, next targCE 1363 patIx++; 1364 targetIxOffset++; 1365 } else { // ceMatch == CE_SKIP_PATN 1366 // redo with same targCE, next patCE 1367 targetIxOffset--; 1368 } 1369 } 1370 } 1371 1372 if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) { 1373 // No match at this targetIx. Try again at the next. 1374 continue; 1375 } 1376 1377 if (!found) { 1378 // No match at all, we have run off the end of the target text. 1379 break; 1380 } 1381 1382 // We have found a match in CE space. 1383 // Now determine the bounds in string index space. 1384 // There still is a chance of match failure if the CE range not correspond to 1385 // an acceptable character range. 1386 // 1387 CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset); 1388 mStart = firstCEI.lowIndex_; 1389 1390 // Check for the start of the match being within a combining sequence. 1391 // This can happen if the pattern itself begins with a combining char, and 1392 // the match found combining marks in the target text that were attached 1393 // to something else. 1394 // This type of match should be rejected for not completely consuming a 1395 // combining sequence. 1396 if (!isBreakBoundary(mStart)) { 1397 found = false; 1398 } 1399 1400 // Look at the high index of the first CE in the match. If it's the same as the 1401 // low index, the first CE in the match is in the middle of an expansion. 1402 if (mStart == firstCEI.highIndex_) { 1403 found = false; 1404 } 1405 1406 minLimit = lastCEI.lowIndex_; 1407 1408 if (targetIx > 0) { 1409 // Look at the CE following the match. If it is UCOL_NULLORDER the match 1410 // extended to the end of input, and the match is good. 1411 1412 // Look at the high and low indices of the CE following the match. If 1413 // they are the same it means one of two things: 1414 // 1. The match extended to the last CE from the target text, which is OK, or 1415 // 2. The last CE that was part of the match is in an expansion that extends 1416 // to the first CE after the match. In this case, we reject the match. 1417 CEI nextCEI = ceb.getPrevious(targetIx - 1); 1418 1419 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) { 1420 found = false; 1421 } 1422 1423 mLimit = maxLimit = nextCEI.lowIndex_; 1424 1425 // Allow matches to end in the middle of a grapheme cluster if the following 1426 // conditions are met; this is needed to make prefix search work properly in 1427 // Indic, see #11750 1428 // * the default breakIter is being used 1429 // * the next collation element after this combining sequence 1430 // - has non-zero primary weight 1431 // - corresponds to a separate character following the one at end of the current match 1432 // (the second of these conditions, and perhaps both, may be redundant given the 1433 // subsequent check for normalization boundary; however they are likely much faster 1434 // tests in any case) 1435 // * the match limit is a normalization boundary 1436 boolean allowMidclusterMatch = 1437 breakIterator == null && 1438 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 && 1439 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit && 1440 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) || 1441 nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit))); 1442 1443 // If those conditions are met, then: 1444 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however 1445 // the match limit may be backed off to a previous break boundary. This handles 1446 // cases in which mLimit includes target characters that are ignorable with current 1447 // settings (such as space) and which extend beyond the pattern match. 1448 // * do NOT require that end of the combining sequence not extend beyond the match in CE space 1449 // * do NOT require that match limit be on a breakIter boundary 1450 1451 // Advance the match end position to the first acceptable match boundary. 1452 // This advances the index over any combining charcters. 1453 if (minLimit < maxLimit) { 1454 int nba = nextBoundaryAfter(minLimit); 1455 // Note that we can have nba < maxLimit && nba >= minLImit, in which 1456 // case we want to set mLimit to nba regardless of allowMidclusterMatch 1457 // (i.e. we back off mLimit to the previous breakIterator boundary). 1458 if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) { 1459 mLimit = nba; 1460 } 1461 } 1462 1463 if (!allowMidclusterMatch) { 1464 // If advancing to the end of a combining sequence in character indexing space 1465 // advanced us beyond the end of the match in CE space, reject this match. 1466 if (mLimit > maxLimit) { 1467 found = false; 1468 } 1469 1470 // Make sure the end of the match is on a break boundary 1471 if (!isBreakBoundary(mLimit)) { 1472 found = false; 1473 } 1474 } 1475 1476 } else { 1477 // No non-ignorable CEs after this point. 1478 // The maximum position is detected by boundary after 1479 // the last non-ignorable CE. Combining sequence 1480 // across the start index will be truncated. 1481 int nba = nextBoundaryAfter(minLimit); 1482 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 1483 } 1484 1485 if (!checkIdentical(mStart, mLimit)) { 1486 found = false; 1487 } 1488 1489 if (found) { 1490 break; 1491 } 1492 } 1493 1494 // All Done. Store back the match bounds to the caller. 1495 // 1496 if (found == false) { 1497 mLimit = -1; 1498 mStart = -1; 1499 } 1500 1501 if (m != null) { 1502 m.start_ = mStart; 1503 m.limit_ = mLimit; 1504 } 1505 1506 return found; 1507 } 1508 1509 // Java porting note: 1510 // 1511 // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical() 1512 // for the linear search implementation. The differences are addressed in search(). 1513 // handleNextExact()1514 private boolean handleNextExact() { 1515 return handleNextCommonImpl(); 1516 } 1517 handleNextCanonical()1518 private boolean handleNextCanonical() { 1519 return handleNextCommonImpl(); 1520 } 1521 handleNextCommonImpl()1522 private boolean handleNextCommonImpl() { 1523 int textOffset = textIter_.getOffset(); 1524 Match match = new Match(); 1525 1526 if (search(textOffset, match)) { 1527 search_.matchedIndex_ = match.start_; 1528 search_.setMatchedLength(match.limit_ - match.start_); 1529 return true; 1530 } else { 1531 setMatchNotFound(); 1532 return false; 1533 } 1534 } 1535 1536 // Java porting note: 1537 // 1538 // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical() 1539 // for the linear search implementation. The differences are addressed in searchBackwards(). 1540 // handlePreviousExact()1541 private boolean handlePreviousExact() { 1542 return handlePreviousCommonImpl(); 1543 } 1544 handlePreviousCanonical()1545 private boolean handlePreviousCanonical() { 1546 return handlePreviousCommonImpl(); 1547 } 1548 handlePreviousCommonImpl()1549 private boolean handlePreviousCommonImpl() { 1550 int textOffset; 1551 1552 if (search_.isOverlap_) { 1553 if (search_.matchedIndex_ != DONE) { 1554 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1; 1555 } else { 1556 // move the start position at the end of possible match 1557 initializePatternPCETable(); 1558 if (!initTextProcessedIter()) { 1559 setMatchNotFound(); 1560 return false; 1561 } 1562 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) { 1563 long pce = textProcessedIter_.nextProcessed(null); 1564 if (pce == CollationPCE.PROCESSED_NULLORDER) { 1565 // at the end of the text 1566 break; 1567 } 1568 } 1569 textOffset = textIter_.getOffset(); 1570 } 1571 } else { 1572 textOffset = textIter_.getOffset(); 1573 } 1574 1575 Match match = new Match(); 1576 if (searchBackwards(textOffset, match)) { 1577 search_.matchedIndex_ = match.start_; 1578 search_.setMatchedLength(match.limit_ - match.start_); 1579 return true; 1580 } else { 1581 setMatchNotFound(); 1582 return false; 1583 } 1584 } 1585 1586 /** 1587 * Gets a substring out of a CharacterIterator 1588 * 1589 * Java porting note: Not available in ICU4C 1590 * 1591 * @param text CharacterIterator 1592 * @param start start offset 1593 * @param length of substring 1594 * @return substring from text starting at start and length length 1595 */ getString(CharacterIterator text, int start, int length)1596 private static final String getString(CharacterIterator text, int start, int length) { 1597 StringBuilder result = new StringBuilder(length); 1598 int offset = text.getIndex(); 1599 text.setIndex(start); 1600 for (int i = 0; i < length; i++) { 1601 result.append(text.current()); 1602 text.next(); 1603 } 1604 text.setIndex(offset); 1605 return result.toString(); 1606 } 1607 1608 /** 1609 * Java port of ICU4C struct UPattern (usrchimp.h) 1610 */ 1611 private static final class Pattern { 1612 /** Pattern string */ 1613 String text_; 1614 1615 long[] PCE_; 1616 int PCELength_ = 0; 1617 1618 // TODO: We probably do not need CE_ / CELength_ 1619 @SuppressWarnings("unused") 1620 int[] CE_; 1621 int CELength_ = 0; 1622 1623 // *** Boyer-Moore *** 1624 // boolean hasPrefixAccents_ = false; 1625 // boolean hasSuffixAccents_ = false; 1626 // int defaultShiftSize_; 1627 // char[] shift_; 1628 // char[] backShift_; 1629 Pattern(String pattern)1630 protected Pattern(String pattern) { 1631 text_ = pattern; 1632 } 1633 } 1634 1635 /** 1636 * Java port of ICU4C UCollationPCE (usrchimp.h) 1637 */ 1638 private static class CollationPCE { 1639 public static final long PROCESSED_NULLORDER = -1; 1640 1641 private static final int DEFAULT_BUFFER_SIZE = 16; 1642 private static final int BUFFER_GROW = 8; 1643 1644 // Note: PRIMARYORDERMASK is also duplicated in StringSearch class 1645 private static final int PRIMARYORDERMASK = 0xffff0000; 1646 private static final int CONTINUATION_MARKER = 0xc0; 1647 1648 private PCEBuffer pceBuffer_ = new PCEBuffer(); 1649 private CollationElementIterator cei_; 1650 private int strength_; 1651 private boolean toShift_; 1652 private boolean isShifted_; 1653 private int variableTop_; 1654 CollationPCE(CollationElementIterator iter)1655 public CollationPCE(CollationElementIterator iter) { 1656 init(iter); 1657 } 1658 init(CollationElementIterator iter)1659 public void init(CollationElementIterator iter) { 1660 cei_ = iter; 1661 init(iter.getRuleBasedCollator()); 1662 } 1663 init(RuleBasedCollator coll)1664 private void init(RuleBasedCollator coll) { 1665 strength_ = coll.getStrength(); 1666 toShift_ = coll.isAlternateHandlingShifted(); 1667 isShifted_ = false; 1668 variableTop_ = coll.getVariableTop(); 1669 } 1670 1671 @SuppressWarnings("fallthrough") processCE(int ce)1672 private long processCE(int ce) { 1673 long primary = 0, secondary = 0, tertiary = 0, quaternary = 0; 1674 1675 // This is clean, but somewhat slow... 1676 // We could apply the mask to ce and then 1677 // just get all three orders... 1678 switch (strength_) { 1679 default: 1680 tertiary = CollationElementIterator.tertiaryOrder(ce); 1681 /* note fall-through */ 1682 1683 case Collator.SECONDARY: 1684 secondary = CollationElementIterator.secondaryOrder(ce); 1685 /* note fall-through */ 1686 1687 case Collator.PRIMARY: 1688 primary = CollationElementIterator.primaryOrder(ce); 1689 } 1690 1691 // **** This should probably handle continuations too. **** 1692 // **** That means that we need 24 bits for the primary **** 1693 // **** instead of the 16 that we're currently using. **** 1694 // **** So we can lay out the 64 bits as: 24.12.12.16. **** 1695 // **** Another complication with continuations is that **** 1696 // **** the *second* CE is marked as a continuation, so **** 1697 // **** we always have to peek ahead to know how long **** 1698 // **** the primary is... **** 1699 if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) { 1700 1701 if (primary == 0) { 1702 return CollationElementIterator.IGNORABLE; 1703 } 1704 1705 if (strength_ >= Collator.QUATERNARY) { 1706 quaternary = primary; 1707 } 1708 1709 primary = secondary = tertiary = 0; 1710 isShifted_ = true; 1711 } else { 1712 if (strength_ >= Collator.QUATERNARY) { 1713 quaternary = 0xFFFF; 1714 } 1715 1716 isShifted_ = false; 1717 } 1718 1719 return primary << 48 | secondary << 32 | tertiary << 16 | quaternary; 1720 } 1721 1722 /** 1723 * Get the processed ordering priority of the next collation element in the text. 1724 * A single character may contain more than one collation element. 1725 * 1726 * Note: This is equivalent to 1727 * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1728 * 1729 * @param range receiving the iterator index before/after fetching the CE. 1730 * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER 1731 * if an error has occurred or if the end of string has been reached 1732 */ nextProcessed(Range range)1733 public long nextProcessed(Range range) { 1734 long result = CollationElementIterator.IGNORABLE; 1735 int low = 0, high = 0; 1736 1737 pceBuffer_.reset(); 1738 1739 do { 1740 low = cei_.getOffset(); 1741 int ce = cei_.next(); 1742 high = cei_.getOffset(); 1743 1744 if (ce == CollationElementIterator.NULLORDER) { 1745 result = PROCESSED_NULLORDER; 1746 break; 1747 } 1748 1749 result = processCE(ce); 1750 } while (result == CollationElementIterator.IGNORABLE); 1751 1752 if (range != null) { 1753 range.ixLow_ = low; 1754 range.ixHigh_ = high; 1755 } 1756 1757 return result; 1758 } 1759 1760 /** 1761 * Get the processed ordering priority of the previous collation element in the text. 1762 * A single character may contain more than one collation element. 1763 * 1764 * Note: This is equivalent to 1765 * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status); 1766 * 1767 * @param range receiving the iterator index before/after fetching the CE. 1768 * @return The previous collation elements ordering, otherwise returns 1769 * PROCESSED_NULLORDER if an error has occurred or if the start of 1770 * string has been reached. 1771 */ previousProcessed(Range range)1772 public long previousProcessed(Range range) { 1773 long result = CollationElementIterator.IGNORABLE; 1774 int low = 0, high = 0; 1775 1776 // pceBuffer_.reset(); 1777 1778 while (pceBuffer_.empty()) { 1779 // buffer raw CEs up to non-ignorable primary 1780 RCEBuffer rceb = new RCEBuffer(); 1781 int ce; 1782 1783 boolean finish = false; 1784 1785 // **** do we need to reset rceb, or will it always be empty at this point **** 1786 do { 1787 high = cei_.getOffset(); 1788 ce = cei_.previous(); 1789 low = cei_.getOffset(); 1790 1791 if (ce == CollationElementIterator.NULLORDER) { 1792 if (!rceb.empty()) { 1793 break; 1794 } 1795 1796 finish = true; 1797 break; 1798 } 1799 1800 rceb.put(ce, low, high); 1801 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce)); 1802 1803 if (finish) { 1804 break; 1805 } 1806 1807 // process the raw CEs 1808 while (!rceb.empty()) { 1809 RCEI rcei = rceb.get(); 1810 1811 result = processCE(rcei.ce_); 1812 1813 if (result != CollationElementIterator.IGNORABLE) { 1814 pceBuffer_.put(result, rcei.low_, rcei.high_); 1815 } 1816 } 1817 } 1818 1819 if (pceBuffer_.empty()) { 1820 // **** Is -1 the right value for ixLow, ixHigh? **** 1821 if (range != null) { 1822 range.ixLow_ = -1; 1823 range.ixHigh_ = -1; 1824 } 1825 return CollationElementIterator.NULLORDER; 1826 } 1827 1828 PCEI pcei = pceBuffer_.get(); 1829 1830 if (range != null) { 1831 range.ixLow_ = pcei.low_; 1832 range.ixHigh_ = pcei.high_; 1833 } 1834 1835 return pcei.ce_; 1836 } 1837 isContinuation(int ce)1838 private static boolean isContinuation(int ce) { 1839 return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER); 1840 } 1841 1842 public static final class Range { 1843 int ixLow_; 1844 int ixHigh_; 1845 } 1846 1847 /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1848 private static final class PCEI { 1849 long ce_; 1850 int low_; 1851 int high_; 1852 } 1853 1854 private static final class PCEBuffer { 1855 private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE]; 1856 private int bufferIndex_ = 0; 1857 reset()1858 void reset() { 1859 bufferIndex_ = 0; 1860 } 1861 empty()1862 boolean empty() { 1863 return bufferIndex_ <= 0; 1864 } 1865 put(long ce, int ixLow, int ixHigh)1866 void put(long ce, int ixLow, int ixHigh) 1867 { 1868 if (bufferIndex_ >= buffer_.length) { 1869 PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW]; 1870 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1871 buffer_ = newBuffer; 1872 } 1873 buffer_[bufferIndex_] = new PCEI(); 1874 buffer_[bufferIndex_].ce_ = ce; 1875 buffer_[bufferIndex_].low_ = ixLow; 1876 buffer_[bufferIndex_].high_ = ixHigh; 1877 1878 bufferIndex_ += 1; 1879 } 1880 get()1881 PCEI get() { 1882 if (bufferIndex_ > 0) { 1883 return buffer_[--bufferIndex_]; 1884 } 1885 return null; 1886 } 1887 } 1888 1889 /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */ 1890 private static final class RCEI { 1891 int ce_; 1892 int low_; 1893 int high_; 1894 } 1895 1896 private static final class RCEBuffer { 1897 private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE]; 1898 private int bufferIndex_ = 0; 1899 empty()1900 boolean empty() { 1901 return bufferIndex_ <= 0; 1902 } 1903 put(int ce, int ixLow, int ixHigh)1904 void put(int ce, int ixLow, int ixHigh) { 1905 if (bufferIndex_ >= buffer_.length) { 1906 RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW]; 1907 System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length); 1908 buffer_ = newBuffer; 1909 } 1910 buffer_[bufferIndex_] = new RCEI(); 1911 buffer_[bufferIndex_].ce_ = ce; 1912 buffer_[bufferIndex_].low_ = ixLow; 1913 buffer_[bufferIndex_].high_ = ixHigh; 1914 1915 bufferIndex_ += 1; 1916 } 1917 get()1918 RCEI get() { 1919 if (bufferIndex_ > 0) { 1920 return buffer_[--bufferIndex_]; 1921 } 1922 return null; 1923 } 1924 } 1925 } 1926 1927 /** 1928 * Java port of ICU4C CEI (usearch.cpp) 1929 * 1930 * CEI Collation Element + source text index. 1931 * These structs are kept in the circular buffer. 1932 */ 1933 private static class CEI { 1934 long ce_; 1935 int lowIndex_; 1936 int highIndex_; 1937 } 1938 1939 /** 1940 * CEBuffer A circular buffer of CEs from the text being searched 1941 */ 1942 private static class CEBuffer { 1943 // Java porting note: ICU4C uses the size for stack buffer 1944 // static final int DEFAULT_CEBUFFER_SIZE = 96; 1945 1946 static final int CEBUFFER_EXTRA = 32; 1947 static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8; 1948 static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3; 1949 1950 CEI[] buf_; 1951 int bufSize_; 1952 int firstIx_; 1953 int limitIx_; 1954 1955 // Java porting note: No references in ICU4C implementation 1956 // CollationElementIterator ceIter_; 1957 1958 StringSearch strSearch_; 1959 CEBuffer(StringSearch ss)1960 CEBuffer(StringSearch ss) { 1961 strSearch_ = ss; 1962 bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA; 1963 if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) { 1964 String patText = ss.pattern_.text_; 1965 if (patText != null) { 1966 for (int i = 0; i < patText.length(); i++) { 1967 char c = patText.charAt(i); 1968 if (MIGHT_BE_JAMO_L(c)) { 1969 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 1970 } else { 1971 // No check for surrogates, we might allocate slightly more buffer than necessary. 1972 bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 1973 } 1974 } 1975 } 1976 } 1977 1978 // Not used - see above 1979 // ceIter_ = ss.textIter_; 1980 1981 firstIx_ = 0; 1982 limitIx_ = 0; 1983 1984 if (!ss.initTextProcessedIter()) { 1985 return; 1986 } 1987 1988 buf_ = new CEI[bufSize_]; 1989 } 1990 1991 // Get the CE with the specified index. 1992 // Index must be in the range 1993 // n-history_size < index < n+1 1994 // where n is the largest index to have been fetched by some previous call to this function. 1995 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 1996 // get(int index)1997 CEI get(int index) { 1998 int i = index % bufSize_; 1999 2000 if (index >= firstIx_ && index < limitIx_) { 2001 // The request was for an entry already in our buffer. 2002 // Just return it. 2003 return buf_[i]; 2004 } 2005 2006 // Caller is requesting a new, never accessed before, CE. 2007 // Verify that it is the next one in sequence, which is all 2008 // that is allowed. 2009 if (index != limitIx_) { 2010 assert(false); 2011 return null; 2012 } 2013 2014 // Manage the circular CE buffer indexing 2015 limitIx_++; 2016 2017 if (limitIx_ - firstIx_ >= bufSize_) { 2018 // The buffer is full, knock out the lowest-indexed entry. 2019 firstIx_++; 2020 } 2021 2022 CollationPCE.Range range = new CollationPCE.Range(); 2023 if (buf_[i] == null) { 2024 buf_[i] = new CEI(); 2025 } 2026 buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range); 2027 buf_[i].lowIndex_ = range.ixLow_; 2028 buf_[i].highIndex_ = range.ixHigh_; 2029 2030 return buf_[i]; 2031 } 2032 2033 // Get the CE with the specified index. 2034 // Index must be in the range 2035 // n-history_size < index < n+1 2036 // where n is the largest index to have been fetched by some previous call to this function. 2037 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 2038 // getPrevious(int index)2039 CEI getPrevious(int index) { 2040 int i = index % bufSize_; 2041 2042 if (index >= firstIx_ && index < limitIx_) { 2043 // The request was for an entry already in our buffer. 2044 // Just return it. 2045 return buf_[i]; 2046 } 2047 2048 // Caller is requesting a new, never accessed before, CE. 2049 // Verify that it is the next one in sequence, which is all 2050 // that is allowed. 2051 if (index != limitIx_) { 2052 assert(false); 2053 return null; 2054 } 2055 2056 // Manage the circular CE buffer indexing 2057 limitIx_++; 2058 2059 if (limitIx_ - firstIx_ >= bufSize_) { 2060 // The buffer is full, knock out the lowest-indexed entry. 2061 firstIx_++; 2062 } 2063 2064 CollationPCE.Range range = new CollationPCE.Range(); 2065 if (buf_[i] == null) { 2066 buf_[i] = new CEI(); 2067 } 2068 buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range); 2069 buf_[i].lowIndex_ = range.ixLow_; 2070 buf_[i].highIndex_ = range.ixHigh_; 2071 2072 return buf_[i]; 2073 } 2074 MIGHT_BE_JAMO_L(char c)2075 static boolean MIGHT_BE_JAMO_L(char c) { 2076 return (c >= 0x1100 && c <= 0x115E) 2077 || (c >= 0x3131 && c <= 0x314E) 2078 || (c >= 0x3165 && c <= 0x3186); 2079 } 2080 } 2081 } 2082