• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  *******************************************************************************
6  * Copyright (C) 1996-2016, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  *******************************************************************************
9  */
10 package ohos.global.icu.text;
11 
12 import java.text.CharacterIterator;
13 import java.text.StringCharacterIterator;
14 import java.util.Locale;
15 
16 import ohos.global.icu.util.ICUException;
17 import ohos.global.icu.util.ULocale;
18 
19 // Java porting note:
20 //
21 //        The ICU4C implementation contains dead code in many places.
22 //      While porting the ICU4C linear search implementation, this dead code
23 //      was not fully ported. The code blocks tagged by "// *** Boyer-Moore ***"
24 //      are those dead code blocks, still available in ICU4C.
25 
26 //        The ICU4C implementation does not seem to handle UCharacterIterator pointing
27 //      to a fragment of text properly. ICU4J uses CharacterIterator to navigate through
28 //      the input text. We need to carefully review the code ported from ICU4C
29 //      assuming the start index is 0.
30 
31 //        ICU4C implementation initializes pattern.CE and pattern.PCE. It looks like
32 //      CE is no longer used, except in a few places checking CELength. It looks like this
33 //      is a leftover from already-disabled Boyer-Moore search code. This Java implementation
34 //      preserves the code, but we should clean this up later.
35 
36 /**
37  *
38  * <tt>StringSearch</tt> is a {@link SearchIterator} that provides
39  * language-sensitive text searching based on the comparison rules defined
40  * in a {@link RuleBasedCollator} object.
41  * StringSearch ensures that language eccentricity can be
42  * handled, e.g. for the German collator, characters &szlig; and SS will be matched
43  * if case is chosen to be ignored.
44  * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm">
45  * "ICU Collation Design Document"</a> for more information.
46  * <p>
47  * There are 2 match options for selection:<br>
48  * Let S' be the sub-string of a text string S between the offsets start and
49  * end [start, end].
50  * <br>
51  * A pattern string P matches a text string S at the offsets [start, end]
52  * if
53  * <pre>
54  * option 1. Some canonical equivalent of P matches some canonical equivalent
55  *           of S'
56  * option 2. P matches S' and if P starts or ends with a combining mark,
57  *           there exists no non-ignorable combining mark before or after S?
58  *           in S respectively.
59  * </pre>
60  * Option 2. is the default.
61  * <p>
62  * This search has APIs similar to that of other text iteration mechanisms
63  * such as the break iterators in {@link BreakIterator}. Using these
64  * APIs, it is easy to scan through text looking for all occurrences of
65  * a given pattern. This search iterator allows changing of direction by
66  * calling a {@link #reset} followed by a {@link #next} or {@link #previous}.
67  * Though a direction change can occur without calling {@link #reset} first,
68  * this operation comes with some speed penalty.
69  * Match results in the forward direction will match the result matches in
70  * the backwards direction in the reverse order
71  * <p>
72  * {@link SearchIterator} provides APIs to specify the starting position
73  * within the text string to be searched, e.g. {@link SearchIterator#setIndex setIndex},
74  * {@link SearchIterator#preceding preceding} and {@link SearchIterator#following following}.
75  * Since the starting position will be set as it is specified, please take note that
76  * there are some danger points at which the search may render incorrect
77  * results:
78  * <ul>
79  * <li> In the midst of a substring that requires normalization.
80  * <li> If the following match is to be found, the position should not be the
81  *      second character which requires swapping with the preceding
82  *      character. Vice versa, if the preceding match is to be found, the
83  *      position to search from should not be the first character which
84  *      requires swapping with the next character. E.g certain Thai and
85  *      Lao characters require swapping.
86  * <li> If a following pattern match is to be found, any position within a
87  *      contracting sequence except the first will fail. Vice versa if a
88  *      preceding pattern match is to be found, an invalid starting point
89  *      would be any character within a contracting sequence except the last.
90  * </ul>
91  * <p>
92  * A {@link BreakIterator} can be used if only matches at logical breaks are desired.
93  * Using a {@link BreakIterator} will only give you results that exactly matches the
94  * boundaries given by the {@link BreakIterator}. For instance the pattern "e" will
95  * not be found in the string "\u00e9" if a character break iterator is used.
96  * <p>
97  * Options are provided to handle overlapping matches.
98  * E.g. In English, overlapping matches produces the result 0 and 2
99  * for the pattern "abab" in the text "ababab", where mutually
100  * exclusive matches only produces the result of 0.
101  * <p>
102  * Options are also provided to implement "asymmetric search" as described in
103  * <a href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">
104  * UTS #10 Unicode Collation Algorithm</a>, specifically the ElementComparisonType
105  * values.
106  * <p>
107  * Though collator attributes will be taken into consideration while
108  * performing matches, there are no APIs here for setting and getting the
109  * attributes. These attributes can be set by getting the collator
110  * from {@link #getCollator} and using the APIs in {@link RuleBasedCollator}.
111  * Lastly to update <tt>StringSearch</tt> to the new collator attributes,
112  * {@link #reset} has to be called.
113  * <p>
114  * Restriction: <br>
115  * Currently there are no composite characters that consists of a
116  * character with combining class &gt; 0 before a character with combining
117  * class == 0. However, if such a character exists in the future,
118  * <tt>StringSearch</tt> does not guarantee the results for option 1.
119  * <p>
120  * Consult the {@link SearchIterator} documentation for information on
121  * and examples of how to use instances of this class to implement text
122  * searching.
123  * <p>
124  * Note, <tt>StringSearch</tt> is not to be subclassed.
125  * </p>
126  * @see SearchIterator
127  * @see RuleBasedCollator
128  * @author Laura Werner, synwee
129  */
130 // internal notes: all methods do not guarantee the correct status of the
131 // characteriterator. the caller has to maintain the original index position
132 // if necessary. methods could change the index position as it deems fit
133 public final class StringSearch extends SearchIterator {
134 
135     private Pattern pattern_;
136     private RuleBasedCollator collator_;
137 
138     // positions within the collation element iterator is used to determine
139     // if we are at the start of the text.
140     private CollationElementIterator textIter_;
141     private CollationPCE textProcessedIter_;
142 
143     // utility collation element, used throughout program for temporary
144     // iteration.
145     private CollationElementIterator utilIter_;
146 
147     private Normalizer2 nfd_;
148 
149     private int strength_;
150     int ceMask_;
151     int variableTop_;
152 
153     private boolean toShift_;
154 
155     // *** Boyer-Moore ***
156     // private char[] canonicalPrefixAccents_;
157     // private char[] canonicalSuffixAccents_;
158 
159     /**
160      * Initializes the iterator to use the language-specific rules defined in
161      * the argument collator to search for argument pattern in the argument
162      * target text. The argument <code>breakiter</code> is used to define logical matches.
163      * See super class documentation for more details on the use of the target
164      * text and {@link BreakIterator}.
165      * @param pattern text to look for.
166      * @param target target text to search for pattern.
167      * @param collator {@link RuleBasedCollator} that defines the language rules
168      * @param breakiter A {@link BreakIterator} that is used to determine the
169      *                boundaries of a logical match. This argument can be null.
170      * @throws IllegalArgumentException thrown when argument target is null,
171      *            or of length 0
172      * @see BreakIterator
173      * @see RuleBasedCollator
174      */
StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator, BreakIterator breakiter)175     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator,
176             BreakIterator breakiter) {
177 
178         // This implementation is ported from ICU4C usearch_open()
179 
180         super(target, breakiter);
181 
182         // string search does not really work when numeric collation is turned on
183         if (collator.getNumericCollation()) {
184             throw new UnsupportedOperationException("Numeric collation is not supported by StringSearch");
185         }
186 
187         collator_ = collator;
188         strength_ = collator.getStrength();
189         ceMask_ = getMask(strength_);
190         toShift_ = collator.isAlternateHandlingShifted();
191         variableTop_ = collator.getVariableTop();
192 
193         nfd_ = Normalizer2.getNFDInstance();
194 
195         pattern_ = new Pattern(pattern);
196 
197         search_.setMatchedLength(0);
198         search_.matchedIndex_ = DONE;
199 
200         utilIter_ = null;
201         textIter_ = new CollationElementIterator(target, collator);
202 
203         textProcessedIter_ = null;
204 
205         // This is done by super class constructor
206         /*
207         search_.isOverlap_ = false;
208         search_.isCanonicalMatch_ = false;
209         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
210         search_.isForwardSearching_ = true;
211         search_.reset_ = true;
212          */
213         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
214         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
215         search_.internalBreakIter_.setText((CharacterIterator)target.clone());  // We need to create a clone
216 
217         initialize();
218     }
219 
220     /**
221      * Initializes the iterator to use the language-specific rules defined in
222      * the argument collator to search for argument pattern in the argument
223      * target text. No {@link BreakIterator}s are set to test for logical matches.
224      * @param pattern text to look for.
225      * @param target target text to search for pattern.
226      * @param collator {@link RuleBasedCollator} that defines the language rules
227      * @throws IllegalArgumentException thrown when argument target is null,
228      *            or of length 0
229      * @see RuleBasedCollator
230      */
StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator)231     public StringSearch(String pattern, CharacterIterator target, RuleBasedCollator collator) {
232         this(pattern, target, collator, null);
233     }
234 
235     /**
236      * Initializes the iterator to use the language-specific rules and
237      * break iterator rules defined in the argument locale to search for
238      * argument pattern in the argument target text.
239      * @param pattern text to look for.
240      * @param target target text to search for pattern.
241      * @param locale locale to use for language and break iterator rules
242      * @throws IllegalArgumentException thrown when argument target is null,
243      *            or of length 0. ClassCastException thrown if the collator for
244      *            the specified locale is not a RuleBasedCollator.
245      */
StringSearch(String pattern, CharacterIterator target, Locale locale)246     public StringSearch(String pattern, CharacterIterator target, Locale locale) {
247         this(pattern, target, ULocale.forLocale(locale));
248     }
249 
250     /**
251      * Initializes the iterator to use the language-specific rules and
252      * break iterator rules defined in the argument locale to search for
253      * argument pattern in the argument target text.
254      * See super class documentation for more details on the use of the target
255      * text and {@link BreakIterator}.
256      * @param pattern text to look for.
257      * @param target target text to search for pattern.
258      * @param locale locale to use for language and break iterator rules
259      * @throws IllegalArgumentException thrown when argument target is null,
260      *            or of length 0. ClassCastException thrown if the collator for
261      *            the specified locale is not a RuleBasedCollator.
262      * @see BreakIterator
263      * @see RuleBasedCollator
264      * @see SearchIterator
265      */
StringSearch(String pattern, CharacterIterator target, ULocale locale)266     public StringSearch(String pattern, CharacterIterator target, ULocale locale) {
267         this(pattern, target, (RuleBasedCollator) Collator.getInstance(locale), null);
268     }
269 
270     /**
271      * Initializes the iterator to use the language-specific rules and
272      * break iterator rules defined in the default locale to search for
273      * argument pattern in the argument target text.
274      * @param pattern text to look for.
275      * @param target target text to search for pattern.
276      * @throws IllegalArgumentException thrown when argument target is null,
277      *            or of length 0. ClassCastException thrown if the collator for
278      *            the default locale is not a RuleBasedCollator.
279      */
StringSearch(String pattern, String target)280     public StringSearch(String pattern, String target) {
281         this(pattern, new StringCharacterIterator(target),
282                 (RuleBasedCollator) Collator.getInstance(), null);
283     }
284 
285     /**
286      * Gets the {@link RuleBasedCollator} used for the language rules.
287      * <p>
288      * Since <tt>StringSearch</tt> depends on the returned {@link RuleBasedCollator}, any
289      * changes to the {@link RuleBasedCollator} result should follow with a call to
290      * either {@link #reset()} or {@link #setCollator(RuleBasedCollator)} to ensure the correct
291      * search behavior.
292      * </p>
293      * @return {@link RuleBasedCollator} used by this <tt>StringSearch</tt>
294      * @see RuleBasedCollator
295      * @see #setCollator
296      */
getCollator()297     public RuleBasedCollator getCollator() {
298         return collator_;
299     }
300 
301     /**
302      * Sets the {@link RuleBasedCollator} to be used for language-specific searching.
303      * <p>
304      * The iterator's position will not be changed by this method.
305      * @param collator to use for this <tt>StringSearch</tt>
306      * @throws IllegalArgumentException thrown when collator is null
307      * @see #getCollator
308      */
setCollator(RuleBasedCollator collator)309     public void setCollator(RuleBasedCollator collator) {
310         if (collator == null) {
311             throw new IllegalArgumentException("Collator can not be null");
312         }
313         collator_ = collator;
314         ceMask_ = getMask(collator_.getStrength());
315 
316         ULocale collLocale = collator.getLocale(ULocale.VALID_LOCALE);
317         search_.internalBreakIter_ = BreakIterator.getCharacterInstance(collLocale == null ? ULocale.ROOT : collLocale);
318         search_.internalBreakIter_.setText((CharacterIterator)search_.text().clone());  // We need to create a clone
319 
320         toShift_ = collator.isAlternateHandlingShifted();
321         variableTop_ = collator.getVariableTop();
322         textIter_ = new CollationElementIterator(pattern_.text_, collator);
323         utilIter_ = new CollationElementIterator(pattern_.text_, collator);
324 
325         // initialize() _after_ setting the iterators for the new collator.
326         initialize();
327     }
328 
329     /**
330      * Returns the pattern for which <tt>StringSearch</tt> is searching for.
331      * @return the pattern searched for
332      */
getPattern()333     public String getPattern() {
334         return pattern_.text_;
335     }
336 
337     /**
338      * Set the pattern to search for.
339      * The iterator's position will not be changed by this method.
340      * @param pattern for searching
341      * @see #getPattern
342      * @exception IllegalArgumentException thrown if pattern is null or of
343      *               length 0
344      */
setPattern(String pattern)345     public void setPattern(String pattern) {
346         if (pattern == null || pattern.length() <= 0) {
347             throw new IllegalArgumentException(
348                     "Pattern to search for can not be null or of length 0");
349         }
350         pattern_.text_ = pattern;
351         initialize();
352     }
353 
354     /**
355      * Determines whether canonical matches (option 1, as described in the
356      * class documentation) is set.
357      * See setCanonical(boolean) for more information.
358      * @see #setCanonical
359      * @return true if canonical matches is set, false otherwise
360      */
361     //TODO: hoist this to SearchIterator
isCanonical()362     public boolean isCanonical() {
363         return search_.isCanonicalMatch_;
364     }
365 
366     /**
367      * Set the canonical match mode. See class documentation for details.
368      * The default setting for this property is false.
369      * @param allowCanonical flag indicator if canonical matches are allowed
370      * @see #isCanonical
371      */
372     //TODO: hoist this to SearchIterator
setCanonical(boolean allowCanonical)373     public void setCanonical(boolean allowCanonical) {
374         search_.isCanonicalMatch_ = allowCanonical;
375     }
376 
377     /**
378      * {@inheritDoc}
379      */
380     @Override
setTarget(CharacterIterator text)381     public void setTarget(CharacterIterator text) {
382         super.setTarget(text);
383         textIter_.setText(text);
384     }
385 
386     /**
387      * {@inheritDoc}
388      */
389     @Override
getIndex()390     public int getIndex() {
391         int result = textIter_.getOffset();
392         if (isOutOfBounds(search_.beginIndex(), search_.endIndex(), result)) {
393             return DONE;
394         }
395         return result;
396     }
397 
398     /**
399      * {@inheritDoc}
400      */
401     @Override
setIndex(int position)402     public void setIndex(int position) {
403         // Java porting note: This method is equivalent to setOffset() in ICU4C.
404         // ICU4C SearchIterator::setOffset() is a pure virtual method, while
405         // ICU4J SearchIterator.setIndex() is not abstract method.
406 
407         super.setIndex(position);
408         textIter_.setOffset(position);
409     }
410 
411     /**
412      * {@inheritDoc}
413      */
414     @Override
reset()415     public void reset() {
416         // reset is setting the attributes that are already in
417         // string search, hence all attributes in the collator should
418         // be retrieved without any problems
419 
420         boolean sameCollAttribute = true;
421         int ceMask;
422         boolean shift;
423         int varTop;
424 
425         // **** hack to deal w/ how processed CEs encode quaternary ****
426         int newStrength = collator_.getStrength();
427         if ((strength_ < Collator.QUATERNARY && newStrength >= Collator.QUATERNARY)
428                 || (strength_ >= Collator.QUATERNARY && newStrength < Collator.QUATERNARY)) {
429             sameCollAttribute = false;
430         }
431 
432         strength_ = collator_.getStrength();
433         ceMask = getMask(strength_);
434         if (ceMask_ != ceMask) {
435             ceMask_ = ceMask;
436             sameCollAttribute = false;
437         }
438 
439         shift = collator_.isAlternateHandlingShifted();
440         if (toShift_ != shift) {
441             toShift_ = shift;
442             sameCollAttribute = false;
443         }
444 
445         varTop = collator_.getVariableTop();
446         if (variableTop_ != varTop) {
447             variableTop_ = varTop;
448             sameCollAttribute = false;
449         }
450 
451         if (!sameCollAttribute) {
452             initialize();
453         }
454 
455         textIter_.setText(search_.text());
456 
457         search_.setMatchedLength(0);
458         search_.matchedIndex_ = DONE;
459         search_.isOverlap_ = false;
460         search_.isCanonicalMatch_ = false;
461         search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
462         search_.isForwardSearching_ = true;
463         search_.reset_ = true;
464     }
465 
466     /**
467      * {@inheritDoc}
468      */
469     @Override
handleNext(int position)470     protected int handleNext(int position) {
471         if (pattern_.CELength_ == 0) {
472             search_.matchedIndex_ = search_.matchedIndex_ == DONE ?
473                                     getIndex() : search_.matchedIndex_ + 1;
474             search_.setMatchedLength(0);
475             textIter_.setOffset(search_.matchedIndex_);
476             if (search_.matchedIndex_ == search_.endIndex()) {
477                 search_.matchedIndex_ = DONE;
478             }
479         } else {
480             if (search_.matchedLength() <= 0) {
481                 // the flipping direction issue has already been handled
482                 // in next()
483                 // for boundary check purposes. this will ensure that the
484                 // next match will not preceed the current offset
485                 // note search_.matchedIndex_ will always be set to something
486                 // in the code
487                 search_.matchedIndex_ = position - 1;
488             }
489 
490             textIter_.setOffset(position);
491 
492             // ICU4C comment:
493             // if strsrch_->breakIter is always the same as m_breakiterator_
494             // then we don't need to check the match boundaries here because
495             // usearch_handleNextXXX will already have done it.
496             if (search_.isCanonicalMatch_) {
497                 // *could* actually use exact here 'cause no extra accents allowed...
498                 handleNextCanonical();
499             } else {
500                 handleNextExact();
501             }
502 
503             if (search_.matchedIndex_ == DONE) {
504                 textIter_.setOffset(search_.endIndex());
505             } else {
506                 textIter_.setOffset(search_.matchedIndex_);
507             }
508 
509             return search_.matchedIndex_;
510         }
511 
512         return DONE;
513     }
514 
515     /**
516      * {@inheritDoc}
517      */
518     @Override
handlePrevious(int position)519     protected int handlePrevious(int position) {
520         if (pattern_.CELength_ == 0) {
521             search_.matchedIndex_ =
522                     search_.matchedIndex_ == DONE ? getIndex() : search_.matchedIndex_;
523             if (search_.matchedIndex_ == search_.beginIndex()) {
524                 setMatchNotFound();
525             } else {
526                 search_.matchedIndex_--;
527                 textIter_.setOffset(search_.matchedIndex_);
528                 search_.setMatchedLength(0);
529             }
530         } else {
531             textIter_.setOffset(position);
532 
533             if (search_.isCanonicalMatch_) {
534                 // *could* use exact match here since extra accents *not* allowed!
535                 handlePreviousCanonical();
536             } else {
537                 handlePreviousExact();
538             }
539         }
540 
541         return search_.matchedIndex_;
542     }
543 
544     // ------------------ Internal implementation code ---------------------------
545 
546     private static final int INITIAL_ARRAY_SIZE_ = 256;
547 
548     // *** Boyer-Moore ***
549     // private static final Normalizer2Impl nfcImpl_ = Norm2AllModes.getNFCInstance().impl;
550     // private static final int LAST_BYTE_MASK_ = 0xff;
551     // private static final int SECOND_LAST_BYTE_SHIFT_ = 8;
552 
553     private static final int PRIMARYORDERMASK = 0xffff0000;
554     private static final int SECONDARYORDERMASK = 0x0000ff00;
555     private static final int TERTIARYORDERMASK = 0x000000ff;
556 
557     /**
558      * Getting the mask for collation strength
559      * @param strength collation strength
560      * @return collation element mask
561      */
getMask(int strength)562     private static int getMask(int strength) {
563         switch (strength) {
564         case Collator.PRIMARY:
565             return PRIMARYORDERMASK;
566         case Collator.SECONDARY:
567             return SECONDARYORDERMASK | PRIMARYORDERMASK;
568         default:
569             return TERTIARYORDERMASK | SECONDARYORDERMASK | PRIMARYORDERMASK;
570         }
571     }
572 
573 
574     // *** Boyer-Moore ***
575     /*
576     private final char getFCD(String str, int offset) {
577         char ch = str.charAt(offset);
578         if (ch < 0x180) {
579             return (char) nfcImpl_.getFCD16FromBelow180(ch);
580         } else if (nfcImpl_.singleLeadMightHaveNonZeroFCD16(ch)) {
581             if (!Character.isHighSurrogate(ch)) {
582                 return (char) nfcImpl_.getFCD16FromNormData(ch);
583             } else {
584                 char c2;
585                 if (++offset < str.length() && Character.isLowSurrogate(c2 = str.charAt(offset))) {
586                     return (char) nfcImpl_.getFCD16FromNormData(Character.toCodePoint(ch, c2));
587                 }
588             }
589         }
590         return 0;
591     }
592 
593     private final char getFCD(int c) {
594         return (char)nfcImpl_.getFCD16(c);
595     }
596     */
597 
598     /**
599      * Getting the modified collation elements taking into account the collation
600      * attributes.
601      *
602      * @param sourcece
603      * @return the modified collation element
604      */
getCE(int sourcece)605     private int getCE(int sourcece) {
606         // note for tertiary we can't use the collator->tertiaryMask, that
607         // is a preprocessed mask that takes into account case options. since
608         // we are only concerned with exact matches, we don't need that.
609         sourcece &= ceMask_;
610 
611         if (toShift_) {
612             // alternate handling here, since only the 16 most significant digits
613             // is only used, we can safely do a compare without masking
614             // if the ce is a variable, we mask and get only the primary values
615             // no shifting to quartenary is required since all primary values
616             // less than variabletop will need to be masked off anyway.
617             if (variableTop_ > sourcece) {
618                 if (strength_ >= Collator.QUATERNARY) {
619                     sourcece &= PRIMARYORDERMASK;
620                 } else {
621                     sourcece = CollationElementIterator.IGNORABLE;
622                 }
623             }
624         } else if (strength_ >= Collator.QUATERNARY && sourcece == CollationElementIterator.IGNORABLE) {
625             sourcece = 0xFFFF;
626         }
627 
628         return sourcece;
629     }
630 
631     /**
632      * Direct port of ICU4C static int32_t * addTouint32_tArray(...) in usearch.cpp
633      * (except not taking destination buffer size and status param).
634      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
635      * implement this in Pattern class.
636      *
637      * @param destination target array
638      * @param offset destination offset to add value
639      * @param value to be added
640      * @param increments incremental size expected
641      * @return new destination array, destination if there was no new allocation
642      */
addToIntArray(int[] destination, int offset, int value, int increments)643     private static int[] addToIntArray(int[] destination, int offset, int value, int increments) {
644         int newlength = destination.length;
645         if (offset + 1 == newlength) {
646             newlength += increments;
647             int temp[] = new int[newlength];
648             System.arraycopy(destination, 0, temp, 0, offset);
649             destination = temp;
650         }
651         destination[offset] = value;
652         return destination;
653     }
654 
655     /**
656      * Direct port of ICU4C static int64_t * addTouint64_tArray(...) in usearch.cpp.
657      * This is used for appending a PCE to Pattern.PCE_ buffer. We probably should
658      * implement this in Pattern class.
659      *
660      * @param destination target array
661      * @param offset destination offset to add value
662      * @param destinationlength target array size
663      * @param value to be added
664      * @param increments incremental size expected
665      * @return new destination array, destination if there was no new allocation
666      */
addToLongArray(long[] destination, int offset, int destinationlength, long value, int increments)667     private static long[] addToLongArray(long[] destination, int offset, int destinationlength,
668             long value, int increments) {
669         int newlength = destinationlength;
670         if (offset + 1 == newlength) {
671             newlength += increments;
672             long temp[] = new long[newlength];
673             System.arraycopy(destination, 0, temp, 0, offset);
674             destination = temp;
675         }
676         destination[offset] = value;
677         return destination;
678     }
679 
680     /**
681      * Initializing the ce table for a pattern.
682      * Stores non-ignorable collation keys.
683      * Table size will be estimated by the size of the pattern text. Table
684      * expansion will be perform as we go along. Adding 1 to ensure that the table
685      * size definitely increases.
686      * @return total number of expansions
687      */
688     // TODO: We probably do not need Pattern CE table.
initializePatternCETable()689     private int initializePatternCETable() {
690         int[] cetable = new int[INITIAL_ARRAY_SIZE_];
691         int patternlength = pattern_.text_.length();
692         CollationElementIterator coleiter = utilIter_;
693 
694         if (coleiter == null) {
695             coleiter = new CollationElementIterator(pattern_.text_, collator_);
696             utilIter_ = coleiter;
697         } else {
698             coleiter.setText(pattern_.text_);
699         }
700 
701         int offset = 0;
702         int result = 0;
703         int ce;
704 
705         while ((ce = coleiter.next()) != CollationElementIterator.NULLORDER) {
706             int newce = getCE(ce);
707             if (newce != CollationElementIterator.IGNORABLE /* 0 */) {
708                 int[] temp = addToIntArray(cetable, offset, newce,
709                         patternlength - coleiter.getOffset() + 1);
710                 offset++;
711                 cetable = temp;
712             }
713             result += (coleiter.getMaxExpansion(ce) - 1);
714         }
715 
716         cetable[offset] = 0;
717         pattern_.CE_ = cetable;
718         pattern_.CELength_ = offset;
719 
720         return result;
721     }
722 
723     /**
724      * Initializing the pce table for a pattern.
725      * Stores non-ignorable collation keys.
726      * Table size will be estimated by the size of the pattern text. Table
727      * expansion will be perform as we go along. Adding 1 to ensure that the table
728      * size definitely increases.
729      * @return total number of expansions
730      */
initializePatternPCETable()731     private int initializePatternPCETable() {
732         long[] pcetable = new long[INITIAL_ARRAY_SIZE_];
733         int pcetablesize = pcetable.length;
734         int patternlength = pattern_.text_.length();
735         CollationElementIterator coleiter = utilIter_;
736 
737         if (coleiter == null) {
738             coleiter = new CollationElementIterator(pattern_.text_, collator_);
739             utilIter_ = coleiter;
740         } else {
741             coleiter.setText(pattern_.text_);
742         }
743 
744         int offset = 0;
745         int result = 0;
746         long pce;
747 
748         CollationPCE iter = new CollationPCE(coleiter);
749 
750         // ** Should processed CEs be signed or unsigned?
751         // ** (the rest of the code in this file seems to play fast-and-loose with
752         // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
753         while ((pce = iter.nextProcessed(null)) != CollationPCE.PROCESSED_NULLORDER) {
754             long[] temp = addToLongArray(pcetable, offset, pcetablesize, pce, patternlength - coleiter.getOffset() + 1);
755             offset++;
756             pcetable = temp;
757         }
758 
759         pcetable[offset] = 0;
760         pattern_.PCE_ = pcetable;
761         pattern_.PCELength_ = offset;
762 
763         return result;
764     }
765 
766     // TODO: This method only triggers initializePatternCETable(), which is probably no
767     //      longer needed.
initializePattern()768     private int initializePattern() {
769         // Since the strength is primary, accents are ignored in the pattern.
770 
771         // *** Boyer-Moore ***
772         /*
773         if (strength_ == Collator.PRIMARY) {
774             pattern_.hasPrefixAccents_ = false;
775             pattern_.hasSuffixAccents_ = false;
776         } else {
777             pattern_.hasPrefixAccents_ = (getFCD(pattern_.text_, 0) >>> SECOND_LAST_BYTE_SHIFT_) != 0;
778             pattern_.hasSuffixAccents_ = (getFCD(pattern_.text_.codePointBefore(pattern_.text_.length())) & LAST_BYTE_MASK_) != 0;
779         }
780         */
781 
782         pattern_.PCE_ = null;
783 
784         // since intializePattern is an internal method status is a success.
785         return initializePatternCETable();
786     }
787 
788     // *** Boyer-Moore ***
789     /*
790      private final void setShiftTable(char shift[],
791                                          char backshift[],
792                                          int cetable[], int cesize,
793                                          int expansionsize,
794                                          int defaultforward,
795                                          int defaultbackward) {
796          // No implementation
797      }
798      */
799 
800     // TODO: This method only triggers initializePattern(), which is probably no
801     //      longer needed.
initialize()802     private void initialize() {
803         /* int expandlength = */ initializePattern();
804 
805         // *** Boyer-Moore ***
806         /*
807         if (pattern_.CELength_ > 0) {
808             int cesize = pattern_.CELength_;
809             int minlength = cesize > expandlength ? cesize - expandlength : 1;
810             pattern_.defaultShiftSize_ = minlength;
811             setShiftTable(pattern_.shift_, pattern_.backShift_, pattern_.CE_, cesize,
812                     expandlength, minlength, minlength);
813             return;
814         }
815         return pattern_.defaultShiftSize_;
816         */
817     }
818 
819     /**
820      * @deprecated This API is ICU internal only.
821      * @hide deprecated on icu4j-org
822      * @hide draft / provisional / internal are hidden on OHOS
823      */
824     @Override
825     @Deprecated
setMatchNotFound()826     protected void setMatchNotFound() {
827         super.setMatchNotFound();
828         // SearchIterator#setMatchNotFound() does following:
829         //      search_.matchedIndex_ = DONE;
830         //      search_.setMatchedLength(0);
831         if (search_.isForwardSearching_) {
832             textIter_.setOffset(search_.text().getEndIndex());
833         } else {
834             textIter_.setOffset(0);
835         }
836     }
837 
838     /**
839      * Checks if the offset runs out of the text string range
840      * @param textstart offset of the first character in the range
841      * @param textlimit limit offset of the text string range
842      * @param offset to test
843      * @return true if offset is out of bounds, false otherwise
844      */
isOutOfBounds(int textstart, int textlimit, int offset)845     private static final boolean isOutOfBounds(int textstart, int textlimit, int offset) {
846         return offset < textstart || offset > textlimit;
847     }
848 
849     /**
850      * Checks for identical match
851      * @param start offset of possible match
852      * @param end offset of possible match
853      * @return TRUE if identical match is found
854      */
checkIdentical(int start, int end)855     private boolean checkIdentical(int start, int end) {
856         if (strength_ != Collator.IDENTICAL) {
857             return true;
858         }
859         // Note: We could use Normalizer::compare() or similar, but for short strings
860         // which may not be in FCD it might be faster to just NFD them.
861         String textstr = getString(targetText, start, end - start);
862         if (Normalizer.quickCheck(textstr, Normalizer.NFD, 0) == Normalizer.NO) {
863             textstr = Normalizer.decompose(textstr, false);
864         }
865         String patternstr = pattern_.text_;
866         if (Normalizer.quickCheck(patternstr, Normalizer.NFD, 0) == Normalizer.NO) {
867             patternstr = Normalizer.decompose(patternstr, false);
868         }
869         return textstr.equals(patternstr);
870     }
871 
initTextProcessedIter()872     private boolean initTextProcessedIter() {
873         if (textProcessedIter_ == null) {
874             textProcessedIter_ = new CollationPCE(textIter_);
875         } else {
876             textProcessedIter_.init(textIter_);
877         }
878         return true;
879     }
880 
881     /*
882      * Find the next break boundary after startIndex. If the UStringSearch object
883      * has an external break iterator, use that. Otherwise use the internal character
884      * break iterator.
885      */
nextBoundaryAfter(int startIndex)886     private int nextBoundaryAfter(int startIndex) {
887         BreakIterator breakiterator = search_.breakIter();
888 
889         if (breakiterator == null) {
890             breakiterator = search_.internalBreakIter_;
891         }
892 
893         if (breakiterator != null) {
894             return breakiterator.following(startIndex);
895         }
896 
897         return startIndex;
898     }
899 
900     /*
901      * Returns TRUE if index is on a break boundary. If the UStringSearch
902      * has an external break iterator, test using that, otherwise test
903      * using the internal character break iterator.
904      */
isBreakBoundary(int index)905     private boolean isBreakBoundary(int index) {
906         BreakIterator breakiterator = search_.breakIter();
907 
908         if (breakiterator == null) {
909             breakiterator = search_.internalBreakIter_;
910         }
911 
912         return (breakiterator != null && breakiterator.isBoundary(index));
913     }
914 
915 
916     // Java porting note: Followings are corresponding to UCompareCEsResult enum
917     private static final int CE_MATCH = -1;
918     private static final int CE_NO_MATCH = 0;
919     private static final int CE_SKIP_TARG = 1;
920     private static final int CE_SKIP_PATN = 2;
921 
922     private static int CE_LEVEL2_BASE = 0x00000005;
923     private static int CE_LEVEL3_BASE = 0x00050000;
924 
compareCE64s(long targCE, long patCE, ElementComparisonType compareType)925     private static int compareCE64s(long targCE, long patCE, ElementComparisonType compareType) {
926         if (targCE == patCE) {
927             return CE_MATCH;
928         }
929         if (compareType == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
930             return CE_NO_MATCH;
931         }
932 
933         long targCEshifted = targCE >>> 32;
934         long patCEshifted = patCE >>> 32;
935         long mask;
936 
937         mask = 0xFFFF0000L;
938         int targLev1 = (int)(targCEshifted & mask);
939         int patLev1 = (int)(patCEshifted & mask);
940         if (targLev1 != patLev1) {
941             if (targLev1 == 0) {
942                 return CE_SKIP_TARG;
943             }
944             if (patLev1 == 0
945                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
946                 return CE_SKIP_PATN;
947             }
948             return CE_NO_MATCH;
949         }
950 
951         mask = 0x0000FFFFL;
952         int targLev2 = (int)(targCEshifted & mask);
953         int patLev2 = (int)(patCEshifted & mask);
954         if (targLev2 != patLev2) {
955             if (targLev2 == 0) {
956                 return CE_SKIP_TARG;
957             }
958             if (patLev2 == 0
959                     && compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD) {
960                 return CE_SKIP_PATN;
961             }
962             return (patLev2 == CE_LEVEL2_BASE ||
963                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
964                         targLev2 == CE_LEVEL2_BASE)) ? CE_MATCH : CE_NO_MATCH;
965         }
966 
967         mask = 0xFFFF0000L;
968         int targLev3 = (int)(targCE & mask);
969         int patLev3 = (int)(patCE & mask);
970         if (targLev3 != patLev3) {
971             return (patLev3 == CE_LEVEL3_BASE ||
972                     (compareType == ElementComparisonType.ANY_BASE_WEIGHT_IS_WILDCARD &&
973                         targLev3 == CE_LEVEL3_BASE) )? CE_MATCH: CE_NO_MATCH;
974         }
975 
976         return CE_MATCH;
977     }
978 
979     /**
980      * An object used for receiving matched index in search() and
981      * searchBackwards().
982      */
983     private static class Match {
984         int start_ = -1;
985         int limit_ = -1;
986     }
987 
search(int startIdx, Match m)988     private boolean search(int startIdx, Match m) {
989         // Input parameter sanity check.
990         if (pattern_.CELength_ == 0
991                 || startIdx < search_.beginIndex()
992                 || startIdx > search_.endIndex()) {
993             throw new IllegalArgumentException("search(" + startIdx + ", m) - expected position to be between " +
994                     search_.beginIndex() + " and " + search_.endIndex());
995         }
996 
997         if (pattern_.PCE_ == null) {
998             initializePatternPCETable();
999         }
1000 
1001         textIter_.setOffset(startIdx);
1002         CEBuffer ceb = new CEBuffer(this);
1003 
1004         int targetIx = 0;
1005         CEI targetCEI = null;
1006         int patIx;
1007         boolean found;
1008 
1009         int mStart = -1;
1010         int mLimit = -1;
1011         int minLimit;
1012         int maxLimit;
1013 
1014         // Outer loop moves over match starting positions in the
1015         //      target CE space.
1016         // Here we see the target as a sequence of collation elements, resulting from the following:
1017         // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
1018         //    (for example, digraphs such as IJ may be broken into two characters).
1019         // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
1020         //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
1021         //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
1022         //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
1023         //    then the CE is deleted, so the following code sees only CEs that are relevant.
1024         // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
1025         // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
1026         // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
1027         for (targetIx = 0; ; targetIx++) {
1028             found = true;
1029             // Inner loop checks for a match beginning at each
1030             // position from the outer loop.
1031             int targetIxOffset = 0;
1032             long patCE = 0;
1033             // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
1034             // (compared to the last CE fetched for the previous targetIx value) as we need to go
1035             // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
1036             CEI firstCEI = ceb.get(targetIx);
1037             if (firstCEI == null) {
1038                 throw new ICUException("CEBuffer.get(" + targetIx + ") returned null.");
1039             }
1040 
1041             for (patIx = 0; patIx < pattern_.PCELength_; patIx++) {
1042                 patCE = pattern_.PCE_[patIx];
1043                 targetCEI = ceb.get(targetIx + patIx + targetIxOffset);
1044                 // Compare CE from target string with CE from the pattern.
1045                 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
1046                 // which will fail the compare, below.
1047                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
1048                 if (ceMatch == CE_NO_MATCH) {
1049                     found = false;
1050                     break;
1051                 } else if (ceMatch > CE_NO_MATCH) {
1052                     if (ceMatch == CE_SKIP_TARG) {
1053                         // redo with same patCE, next targCE
1054                         patIx--;
1055                         targetIxOffset++;
1056                     } else { // ceMatch == CE_SKIP_PATN
1057                         // redo with same targCE, next patCE
1058                         targetIxOffset--;
1059                     }
1060                 }
1061             }
1062             targetIxOffset += pattern_.PCELength_; // this is now the offset in target CE space to end of the match so far
1063 
1064             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
1065                 // No match at this targetIx.  Try again at the next.
1066                 continue;
1067             }
1068 
1069             if (!found) {
1070                 // No match at all, we have run off the end of the target text.
1071                 break;
1072             }
1073 
1074             // We have found a match in CE space.
1075             // Now determine the bounds in string index space.
1076             // There still is a chance of match failure if the CE range not correspond to
1077             // an acceptable character range.
1078             //
1079             CEI lastCEI = ceb.get(targetIx + targetIxOffset -1);
1080 
1081             mStart = firstCEI.lowIndex_;
1082             minLimit = lastCEI.lowIndex_;
1083 
1084             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1085             // extended to the end of input, and the match is good.
1086 
1087             // Look at the high and low indices of the CE following the match. If
1088             // they are the same it means one of two things:
1089             //    1. The match extended to the last CE from the target text, which is OK, or
1090             //    2. The last CE that was part of the match is in an expansion that extends
1091             //       to the first CE after the match. In this case, we reject the match.
1092             CEI nextCEI = null;
1093             if (search_.elementComparisonType_ == ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
1094                 nextCEI = ceb.get(targetIx + targetIxOffset);
1095                 maxLimit = nextCEI.lowIndex_;
1096                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
1097                     found = false;
1098                 }
1099             } else {
1100                 for (;; ++targetIxOffset) {
1101                     nextCEI = ceb.get(targetIx + targetIxOffset);
1102                     maxLimit = nextCEI.lowIndex_;
1103                     // If we are at the end of the target too, match succeeds
1104                     if (nextCEI.ce_ == CollationPCE.PROCESSED_NULLORDER) {
1105                         break;
1106                     }
1107                     // As long as the next CE has primary weight of 0,
1108                     // it is part of the last target element matched by the pattern;
1109                     // make sure it can be part of a match with the last patCE
1110                     if ((((nextCEI.ce_) >>> 32) & 0xFFFF0000L) == 0) {
1111                         int ceMatch = compareCE64s(nextCEI.ce_, patCE, search_.elementComparisonType_);
1112                         if (ceMatch == CE_NO_MATCH || ceMatch == CE_SKIP_PATN ) {
1113                             found = false;
1114                             break;
1115                         }
1116                     // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
1117                     // target element, but it has non-zero primary weight => match fails
1118                     } else if ( nextCEI.lowIndex_ == nextCEI.highIndex_ ) {
1119                         found = false;
1120                         break;
1121                     // Else the target CE is not part of an expansion of the last matched element, match succeeds
1122                     } else {
1123                         break;
1124                     }
1125                 }
1126             }
1127 
1128             // Check for the start of the match being within a combining sequence.
1129             // This can happen if the pattern itself begins with a combining char, and
1130             // the match found combining marks in the target text that were attached
1131             // to something else.
1132             // This type of match should be rejected for not completely consuming a
1133             // combining sequence.
1134             if (!isBreakBoundary(mStart)) {
1135                 found = false;
1136             }
1137 
1138             // Check for the start of the match being within an Collation Element Expansion,
1139             // meaning that the first char of the match is only partially matched.
1140             // With expansions, the first CE will report the index of the source
1141             // character, and all subsequent (expansions) CEs will report the source index of the
1142             // _following_ character.
1143             int secondIx = firstCEI.highIndex_;
1144             if (mStart == secondIx) {
1145                 found = false;
1146             }
1147 
1148             // Allow matches to end in the middle of a grapheme cluster if the following
1149             // conditions are met; this is needed to make prefix search work properly in
1150             // Indic, see #11750
1151             // * the default breakIter is being used
1152             // * the next collation element after this combining sequence
1153             //   - has non-zero primary weight
1154             //   - corresponds to a separate character following the one at end of the current match
1155             //   (the second of these conditions, and perhaps both, may be redundant given the
1156             //   subsequent check for normalization boundary; however they are likely much faster
1157             //   tests in any case)
1158             // * the match limit is a normalization boundary
1159             boolean allowMidclusterMatch =
1160                             breakIterator == null &&
1161                             (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
1162                             maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
1163                             (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
1164                                     nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
1165 
1166             // If those conditions are met, then:
1167             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
1168             //   the match limit may be backed off to a previous break boundary. This handles
1169             //   cases in which mLimit includes target characters that are ignorable with current
1170             //   settings (such as space) and which extend beyond the pattern match.
1171             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
1172             // * do NOT require that match limit be on a breakIter boundary
1173 
1174             // Advance the match end position to the first acceptable match boundary.
1175             // This advances the index over any combining characters.
1176             mLimit = maxLimit;
1177             if (minLimit < maxLimit) {
1178                 // When the last CE's low index is same with its high index, the CE is likely
1179                 // a part of expansion. In this case, the index is located just after the
1180                 // character corresponding to the CEs compared above. If the index is right
1181                 // at the break boundary, move the position to the next boundary will result
1182                 // incorrect match length when there are ignorable characters exist between
1183                 // the position and the next character produces CE(s). See ticket#8482.
1184                 if (minLimit == lastCEI.highIndex_ && isBreakBoundary(minLimit)) {
1185                     mLimit = minLimit;
1186                 } else {
1187                     int nba = nextBoundaryAfter(minLimit);
1188                     // Note that we can have nba < maxLimit && nba >= minLImit, in which
1189                     // case we want to set mLimit to nba regardless of allowMidclusterMatch
1190                     // (i.e. we back off mLimit to the previous breakIterator boundary).
1191                     if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
1192                         mLimit = nba;
1193                     }
1194                 }
1195             }
1196 
1197             if (!allowMidclusterMatch) {
1198                 // If advancing to the end of a combining sequence in character indexing space
1199                 // advanced us beyond the end of the match in CE space, reject this match.
1200                 if (mLimit > maxLimit) {
1201                     found = false;
1202                 }
1203 
1204                 if (!isBreakBoundary(mLimit)) {
1205                     found = false;
1206                 }
1207             }
1208 
1209             if (!checkIdentical(mStart, mLimit)) {
1210                 found = false;
1211             }
1212 
1213             if (found) {
1214                 break;
1215             }
1216         }
1217 
1218         // All Done.  Store back the match bounds to the caller.
1219         //
1220         if (found == false) {
1221             mLimit = -1;
1222             mStart = -1;
1223         }
1224 
1225         if (m != null) {
1226             m.start_ = mStart;
1227             m.limit_ = mLimit;
1228         }
1229 
1230         return found;
1231     }
1232 
codePointAt(CharacterIterator iter, int index)1233     private static int codePointAt(CharacterIterator iter, int index) {
1234         int currentIterIndex = iter.getIndex();
1235         char codeUnit = iter.setIndex(index);
1236         int cp = codeUnit;
1237         if (Character.isHighSurrogate(codeUnit)) {
1238             char nextUnit = iter.next();
1239             if (Character.isLowSurrogate(nextUnit)) {
1240                 cp = Character.toCodePoint(codeUnit, nextUnit);
1241             }
1242         }
1243         iter.setIndex(currentIterIndex);  // restore iter position
1244         return cp;
1245     }
1246 
codePointBefore(CharacterIterator iter, int index)1247     private static int codePointBefore(CharacterIterator iter, int index) {
1248         int currentIterIndex = iter.getIndex();
1249         iter.setIndex(index);
1250         char codeUnit = iter.previous();
1251         int cp = codeUnit;
1252         if (Character.isLowSurrogate(codeUnit)) {
1253             char prevUnit = iter.previous();
1254             if (Character.isHighSurrogate(prevUnit)) {
1255                 cp = Character.toCodePoint(prevUnit, codeUnit);
1256             }
1257         }
1258         iter.setIndex(currentIterIndex);  // restore iter position
1259         return cp;
1260     }
1261 
searchBackwards(int startIdx, Match m)1262     private boolean searchBackwards(int startIdx, Match m) {
1263         //ICU4C_TODO comment:  reject search patterns beginning with a combining char.
1264 
1265         // Input parameter sanity check.
1266         if (pattern_.CELength_ == 0
1267                 || startIdx < search_.beginIndex()
1268                 || startIdx > search_.endIndex()) {
1269             throw new IllegalArgumentException("searchBackwards(" + startIdx + ", m) - expected position to be between " +
1270                     search_.beginIndex() + " and " + search_.endIndex());
1271         }
1272 
1273         if (pattern_.PCE_ == null) {
1274             initializePatternPCETable();
1275         }
1276 
1277         CEBuffer ceb = new CEBuffer(this);
1278         int targetIx = 0;
1279 
1280         /*
1281          * Pre-load the buffer with the CE's for the grapheme
1282          * after our starting position so that we're sure that
1283          * we can look at the CE following the match when we
1284          * check the match boundaries.
1285          *
1286          * This will also pre-fetch the first CE that we'll
1287          * consider for the match.
1288          */
1289         if (startIdx < search_.endIndex()) {
1290             BreakIterator bi = search_.internalBreakIter_;
1291             int next = bi.following(startIdx);
1292 
1293             textIter_.setOffset(next);
1294 
1295             for (targetIx = 0; ; targetIx++) {
1296                 if (ceb.getPrevious(targetIx).lowIndex_ < startIdx) {
1297                     break;
1298                 }
1299             }
1300         } else {
1301             textIter_.setOffset(startIdx);
1302         }
1303 
1304         CEI targetCEI = null;
1305         int patIx;
1306         boolean found;
1307 
1308         int limitIx = targetIx;
1309         int mStart = -1;
1310         int mLimit = -1;
1311         int minLimit;
1312         int maxLimit;
1313 
1314         // Outer loop moves over match starting positions in the
1315         //      target CE space.
1316         // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
1317         // But  patIx is 0 at the beginning of the pattern and increases toward the end.
1318         // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
1319         // and the beginning of the base text.
1320         for (targetIx = limitIx; ; targetIx++) {
1321             found = true;
1322             // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
1323             // (compared to the last CE fetched for the previous targetIx value) as we need to go
1324             // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
1325             CEI lastCEI = ceb.getPrevious(targetIx);
1326             if (lastCEI == null) {
1327                 throw new ICUException("CEBuffer.getPrevious(" + targetIx + ") returned null.");
1328             }
1329             // Inner loop checks for a match beginning at each
1330             // position from the outer loop.
1331             int targetIxOffset = 0;
1332             for (patIx = pattern_.PCELength_ - 1; patIx >= 0; patIx--) {
1333                 long patCE = pattern_.PCE_[patIx];
1334 
1335                 targetCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 - patIx + targetIxOffset);
1336                 // Compare CE from target string with CE from the pattern.
1337                 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
1338                 // which will fail the compare, below.
1339                 int ceMatch = compareCE64s(targetCEI.ce_, patCE, search_.elementComparisonType_);
1340                 if (ceMatch == CE_NO_MATCH) {
1341                     found = false;
1342                     break;
1343                 } else if (ceMatch > CE_NO_MATCH) {
1344                     if (ceMatch == CE_SKIP_TARG) {
1345                         // redo with same patCE, next targCE
1346                         patIx++;
1347                         targetIxOffset++;
1348                     } else { // ceMatch == CE_SKIP_PATN
1349                         // redo with same targCE, next patCE
1350                         targetIxOffset--;
1351                     }
1352                 }
1353             }
1354 
1355             if (!found && ((targetCEI == null) || (targetCEI.ce_ != CollationPCE.PROCESSED_NULLORDER))) {
1356                 // No match at this targetIx.  Try again at the next.
1357                 continue;
1358             }
1359 
1360             if (!found) {
1361                 // No match at all, we have run off the end of the target text.
1362                 break;
1363             }
1364 
1365             // We have found a match in CE space.
1366             // Now determine the bounds in string index space.
1367             // There still is a chance of match failure if the CE range not correspond to
1368             // an acceptable character range.
1369             //
1370             CEI firstCEI = ceb.getPrevious(targetIx + pattern_.PCELength_ - 1 + targetIxOffset);
1371             mStart = firstCEI.lowIndex_;
1372 
1373             // Check for the start of the match being within a combining sequence.
1374             // This can happen if the pattern itself begins with a combining char, and
1375             // the match found combining marks in the target text that were attached
1376             // to something else.
1377             // This type of match should be rejected for not completely consuming a
1378             // combining sequence.
1379             if (!isBreakBoundary(mStart)) {
1380                 found = false;
1381             }
1382 
1383             // Look at the high index of the first CE in the match. If it's the same as the
1384             // low index, the first CE in the match is in the middle of an expansion.
1385             if (mStart == firstCEI.highIndex_) {
1386                 found = false;
1387             }
1388 
1389             minLimit = lastCEI.lowIndex_;
1390 
1391             if (targetIx > 0) {
1392                 // Look at the CE following the match.  If it is UCOL_NULLORDER the match
1393                 // extended to the end of input, and the match is good.
1394 
1395                 // Look at the high and low indices of the CE following the match. If
1396                 // they are the same it means one of two things:
1397                 //    1. The match extended to the last CE from the target text, which is OK, or
1398                 //    2. The last CE that was part of the match is in an expansion that extends
1399                 //       to the first CE after the match. In this case, we reject the match.
1400                 CEI nextCEI  = ceb.getPrevious(targetIx - 1);
1401 
1402                 if (nextCEI.lowIndex_ == nextCEI.highIndex_ && nextCEI.ce_ != CollationPCE.PROCESSED_NULLORDER) {
1403                     found = false;
1404                 }
1405 
1406                 mLimit = maxLimit = nextCEI.lowIndex_;
1407 
1408                 // Allow matches to end in the middle of a grapheme cluster if the following
1409                 // conditions are met; this is needed to make prefix search work properly in
1410                 // Indic, see #11750
1411                 // * the default breakIter is being used
1412                 // * the next collation element after this combining sequence
1413                 //   - has non-zero primary weight
1414                 //   - corresponds to a separate character following the one at end of the current match
1415                 //   (the second of these conditions, and perhaps both, may be redundant given the
1416                 //   subsequent check for normalization boundary; however they are likely much faster
1417                 //   tests in any case)
1418                 // * the match limit is a normalization boundary
1419                 boolean allowMidclusterMatch =
1420                                 breakIterator == null &&
1421                                 (((nextCEI.ce_) >>> 32) & 0xFFFF0000L) != 0 &&
1422                                 maxLimit >= lastCEI.highIndex_ && nextCEI.highIndex_ > maxLimit &&
1423                                 (nfd_.hasBoundaryBefore(codePointAt(targetText, maxLimit)) ||
1424                                         nfd_.hasBoundaryAfter(codePointBefore(targetText, maxLimit)));
1425 
1426                 // If those conditions are met, then:
1427                 // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
1428                 //   the match limit may be backed off to a previous break boundary. This handles
1429                 //   cases in which mLimit includes target characters that are ignorable with current
1430                 //   settings (such as space) and which extend beyond the pattern match.
1431                 // * do NOT require that end of the combining sequence not extend beyond the match in CE space
1432                 // * do NOT require that match limit be on a breakIter boundary
1433 
1434                 // Advance the match end position to the first acceptable match boundary.
1435                 // This advances the index over any combining charcters.
1436                 if (minLimit < maxLimit) {
1437                     int nba = nextBoundaryAfter(minLimit);
1438                     // Note that we can have nba < maxLimit && nba >= minLImit, in which
1439                     // case we want to set mLimit to nba regardless of allowMidclusterMatch
1440                     // (i.e. we back off mLimit to the previous breakIterator boundary).
1441                     if (nba >= lastCEI.highIndex_ && (!allowMidclusterMatch || nba < maxLimit)) {
1442                         mLimit = nba;
1443                     }
1444                 }
1445 
1446                 if (!allowMidclusterMatch) {
1447                     // If advancing to the end of a combining sequence in character indexing space
1448                     // advanced us beyond the end of the match in CE space, reject this match.
1449                     if (mLimit > maxLimit) {
1450                         found = false;
1451                     }
1452 
1453                     // Make sure the end of the match is on a break boundary
1454                     if (!isBreakBoundary(mLimit)) {
1455                         found = false;
1456                     }
1457                 }
1458 
1459             } else {
1460                 // No non-ignorable CEs after this point.
1461                 // The maximum position is detected by boundary after
1462                 // the last non-ignorable CE. Combining sequence
1463                 // across the start index will be truncated.
1464                 int nba = nextBoundaryAfter(minLimit);
1465                 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
1466             }
1467 
1468             if (!checkIdentical(mStart, mLimit)) {
1469                 found = false;
1470             }
1471 
1472             if (found) {
1473                 break;
1474             }
1475         }
1476 
1477         // All Done.  Store back the match bounds to the caller.
1478         //
1479         if (found == false) {
1480             mLimit = -1;
1481             mStart = -1;
1482         }
1483 
1484         if (m != null) {
1485             m.start_ = mStart;
1486             m.limit_ = mLimit;
1487         }
1488 
1489         return found;
1490     }
1491 
1492     // Java porting note:
1493     //
1494     // ICU4C usearch_handleNextExact() is identical to usearch_handleNextCanonical()
1495     // for the linear search implementation. The differences are addressed in search().
1496     //
handleNextExact()1497     private boolean handleNextExact() {
1498         return handleNextCommonImpl();
1499     }
1500 
handleNextCanonical()1501     private boolean handleNextCanonical() {
1502         return handleNextCommonImpl();
1503     }
1504 
handleNextCommonImpl()1505     private boolean handleNextCommonImpl() {
1506         int textOffset = textIter_.getOffset();
1507         Match match = new Match();
1508 
1509         if (search(textOffset, match)) {
1510             search_.matchedIndex_ = match.start_;
1511             search_.setMatchedLength(match.limit_ - match.start_);
1512             return true;
1513         } else {
1514             setMatchNotFound();
1515             return false;
1516         }
1517     }
1518 
1519     // Java porting note:
1520     //
1521     // ICU4C usearch_handlePreviousExact() is identical to usearch_handlePreviousCanonical()
1522     // for the linear search implementation. The differences are addressed in searchBackwards().
1523     //
handlePreviousExact()1524     private boolean handlePreviousExact() {
1525         return handlePreviousCommonImpl();
1526     }
1527 
handlePreviousCanonical()1528     private boolean handlePreviousCanonical() {
1529         return handlePreviousCommonImpl();
1530     }
1531 
handlePreviousCommonImpl()1532     private boolean handlePreviousCommonImpl() {
1533         int textOffset;
1534 
1535         if (search_.isOverlap_) {
1536             if (search_.matchedIndex_ != DONE) {
1537                 textOffset = search_.matchedIndex_ + search_.matchedLength() - 1;
1538             } else {
1539                 // move the start position at the end of possible match
1540                 initializePatternPCETable();
1541                 if (!initTextProcessedIter()) {
1542                     setMatchNotFound();
1543                     return false;
1544                 }
1545                 for (int nPCEs = 0; nPCEs < pattern_.PCELength_ - 1; nPCEs++) {
1546                     long pce = textProcessedIter_.nextProcessed(null);
1547                     if (pce == CollationPCE.PROCESSED_NULLORDER) {
1548                         // at the end of the text
1549                         break;
1550                     }
1551                 }
1552                 textOffset = textIter_.getOffset();
1553             }
1554         } else {
1555             textOffset = textIter_.getOffset();
1556         }
1557 
1558         Match match = new Match();
1559         if (searchBackwards(textOffset, match)) {
1560             search_.matchedIndex_ = match.start_;
1561             search_.setMatchedLength(match.limit_ - match.start_);
1562             return true;
1563         } else {
1564             setMatchNotFound();
1565             return false;
1566         }
1567     }
1568 
1569     /**
1570      * Gets a substring out of a CharacterIterator
1571      *
1572      * Java porting note: Not available in ICU4C
1573      *
1574      * @param text CharacterIterator
1575      * @param start start offset
1576      * @param length of substring
1577      * @return substring from text starting at start and length length
1578      */
getString(CharacterIterator text, int start, int length)1579     private static final String getString(CharacterIterator text, int start, int length) {
1580         StringBuilder result = new StringBuilder(length);
1581         int offset = text.getIndex();
1582         text.setIndex(start);
1583         for (int i = 0; i < length; i++) {
1584             result.append(text.current());
1585             text.next();
1586         }
1587         text.setIndex(offset);
1588         return result.toString();
1589     }
1590 
1591     /**
1592      * Java port of ICU4C struct UPattern (usrchimp.h)
1593      */
1594     private static final class Pattern {
1595         /** Pattern string */
1596         String text_;
1597 
1598         long[] PCE_;
1599         int PCELength_ = 0;
1600 
1601         // TODO: We probably do not need CE_ / CELength_
1602         @SuppressWarnings("unused")
1603         int[] CE_;
1604         int CELength_ = 0;
1605 
1606         // *** Boyer-Moore ***
1607         // boolean hasPrefixAccents_ = false;
1608         // boolean hasSuffixAccents_ = false;
1609         // int defaultShiftSize_;
1610         // char[] shift_;
1611         // char[] backShift_;
1612 
Pattern(String pattern)1613         protected Pattern(String pattern) {
1614             text_ = pattern;
1615         }
1616     }
1617 
1618     /**
1619      * Java port of ICU4C UCollationPCE (usrchimp.h)
1620      */
1621     private static class CollationPCE {
1622         public static final long PROCESSED_NULLORDER = -1;
1623 
1624         private static final int DEFAULT_BUFFER_SIZE = 16;
1625         private static final int BUFFER_GROW = 8;
1626 
1627         // Note: PRIMARYORDERMASK is also duplicated in StringSearch class
1628         private static final int PRIMARYORDERMASK = 0xffff0000;
1629         private static final int CONTINUATION_MARKER = 0xc0;
1630 
1631         private PCEBuffer pceBuffer_ = new PCEBuffer();
1632         private CollationElementIterator cei_;
1633         private int strength_;
1634         private boolean toShift_;
1635         private boolean isShifted_;
1636         private int variableTop_;
1637 
CollationPCE(CollationElementIterator iter)1638         public CollationPCE(CollationElementIterator iter) {
1639             init(iter);
1640         }
1641 
init(CollationElementIterator iter)1642         public void init(CollationElementIterator iter) {
1643             cei_ = iter;
1644             init(iter.getRuleBasedCollator());
1645         }
1646 
init(RuleBasedCollator coll)1647         private void init(RuleBasedCollator coll) {
1648             strength_ = coll.getStrength();
1649             toShift_ = coll.isAlternateHandlingShifted();
1650             isShifted_ = false;
1651             variableTop_ = coll.getVariableTop();
1652         }
1653 
1654         @SuppressWarnings("fallthrough")
processCE(int ce)1655         private long processCE(int ce) {
1656             long primary = 0, secondary = 0, tertiary = 0, quaternary = 0;
1657 
1658             // This is clean, but somewhat slow...
1659             // We could apply the mask to ce and then
1660             // just get all three orders...
1661             switch (strength_) {
1662             default:
1663                 tertiary = CollationElementIterator.tertiaryOrder(ce);
1664                 /* note fall-through */
1665 
1666             case Collator.SECONDARY:
1667                 secondary = CollationElementIterator.secondaryOrder(ce);
1668                 /* note fall-through */
1669 
1670             case Collator.PRIMARY:
1671                 primary = CollationElementIterator.primaryOrder(ce);
1672             }
1673 
1674             // **** This should probably handle continuations too. ****
1675             // **** That means that we need 24 bits for the primary ****
1676             // **** instead of the 16 that we're currently using. ****
1677             // **** So we can lay out the 64 bits as: 24.12.12.16. ****
1678             // **** Another complication with continuations is that ****
1679             // **** the *second* CE is marked as a continuation, so ****
1680             // **** we always have to peek ahead to know how long ****
1681             // **** the primary is... ****
1682             if ((toShift_ && variableTop_ > ce && primary != 0) || (isShifted_ && primary == 0)) {
1683 
1684                 if (primary == 0) {
1685                     return CollationElementIterator.IGNORABLE;
1686                 }
1687 
1688                 if (strength_ >= Collator.QUATERNARY) {
1689                     quaternary = primary;
1690                 }
1691 
1692                 primary = secondary = tertiary = 0;
1693                 isShifted_ = true;
1694             } else {
1695                 if (strength_ >= Collator.QUATERNARY) {
1696                     quaternary = 0xFFFF;
1697                 }
1698 
1699                 isShifted_ = false;
1700             }
1701 
1702             return primary << 48 | secondary << 32 | tertiary << 16 | quaternary;
1703         }
1704 
1705         /**
1706          * Get the processed ordering priority of the next collation element in the text.
1707          * A single character may contain more than one collation element.
1708          *
1709          * Note: This is equivalent to
1710          * UCollationPCE::nextProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
1711          *
1712          * @param range receiving the iterator index before/after fetching the CE.
1713          * @return The next collation elements ordering, otherwise returns PROCESSED_NULLORDER
1714          *         if an error has occurred or if the end of string has been reached
1715          */
nextProcessed(Range range)1716         public long nextProcessed(Range range) {
1717             long result = CollationElementIterator.IGNORABLE;
1718             int low = 0, high = 0;
1719 
1720             pceBuffer_.reset();
1721 
1722             do {
1723                 low = cei_.getOffset();
1724                 int ce = cei_.next();
1725                 high = cei_.getOffset();
1726 
1727                 if (ce == CollationElementIterator.NULLORDER) {
1728                      result = PROCESSED_NULLORDER;
1729                      break;
1730                 }
1731 
1732                 result = processCE(ce);
1733             } while (result == CollationElementIterator.IGNORABLE);
1734 
1735             if (range != null) {
1736                 range.ixLow_ = low;
1737                 range.ixHigh_ = high;
1738             }
1739 
1740             return result;
1741         }
1742 
1743         /**
1744          * Get the processed ordering priority of the previous collation element in the text.
1745          * A single character may contain more than one collation element.
1746          *
1747          * Note: This is equivalent to
1748          * UCollationPCE::previousProcessed(int32_t *ixLow, int32_t *ixHigh, UErrorCode *status);
1749          *
1750          * @param range receiving the iterator index before/after fetching the CE.
1751          * @return The previous collation elements ordering, otherwise returns
1752          *         PROCESSED_NULLORDER if an error has occurred or if the start of
1753          *         string has been reached.
1754          */
previousProcessed(Range range)1755         public long previousProcessed(Range range) {
1756             long result = CollationElementIterator.IGNORABLE;
1757             int low = 0, high = 0;
1758 
1759             // pceBuffer_.reset();
1760 
1761             while (pceBuffer_.empty()) {
1762                 // buffer raw CEs up to non-ignorable primary
1763                 RCEBuffer rceb = new RCEBuffer();
1764                 int ce;
1765 
1766                 boolean finish = false;
1767 
1768                 // **** do we need to reset rceb, or will it always be empty at this point ****
1769                 do {
1770                     high = cei_.getOffset();
1771                     ce = cei_.previous();
1772                     low = cei_.getOffset();
1773 
1774                     if (ce == CollationElementIterator.NULLORDER) {
1775                         if (!rceb.empty()) {
1776                             break;
1777                         }
1778 
1779                         finish = true;
1780                         break;
1781                     }
1782 
1783                     rceb.put(ce, low, high);
1784                 } while ((ce & PRIMARYORDERMASK) == 0 || isContinuation(ce));
1785 
1786                 if (finish) {
1787                     break;
1788                 }
1789 
1790                 // process the raw CEs
1791                 while (!rceb.empty()) {
1792                     RCEI rcei = rceb.get();
1793 
1794                     result = processCE(rcei.ce_);
1795 
1796                     if (result != CollationElementIterator.IGNORABLE) {
1797                         pceBuffer_.put(result, rcei.low_, rcei.high_);
1798                     }
1799                 }
1800             }
1801 
1802             if (pceBuffer_.empty()) {
1803                 // **** Is -1 the right value for ixLow, ixHigh? ****
1804                 if (range != null) {
1805                     range.ixLow_ = -1;
1806                     range.ixHigh_ = -1;
1807                 }
1808                 return CollationElementIterator.NULLORDER;
1809             }
1810 
1811             PCEI pcei = pceBuffer_.get();
1812 
1813             if (range != null) {
1814                 range.ixLow_ = pcei.low_;
1815                 range.ixHigh_ = pcei.high_;
1816             }
1817 
1818             return pcei.ce_;
1819         }
1820 
isContinuation(int ce)1821         private static boolean isContinuation(int ce) {
1822             return ((ce & CONTINUATION_MARKER) == CONTINUATION_MARKER);
1823         }
1824 
1825         /**
1826          * @hide exposed on OHOS
1827          */
1828         public static final class Range {
1829             int ixLow_;
1830             int ixHigh_;
1831         }
1832 
1833         /** Processed collation element buffer stuff ported from ICU4C ucoleitr.cpp */
1834         private static final class PCEI {
1835             long ce_;
1836             int low_;
1837             int high_;
1838         }
1839 
1840         private static final class PCEBuffer {
1841             private PCEI[] buffer_ = new PCEI[DEFAULT_BUFFER_SIZE];
1842             private int bufferIndex_ = 0;
1843 
reset()1844             void reset() {
1845                 bufferIndex_ = 0;
1846             }
1847 
empty()1848             boolean empty() {
1849                 return bufferIndex_ <= 0;
1850             }
1851 
put(long ce, int ixLow, int ixHigh)1852             void put(long ce, int ixLow, int ixHigh)
1853             {
1854                 if (bufferIndex_ >= buffer_.length) {
1855                     PCEI[] newBuffer = new PCEI[buffer_.length + BUFFER_GROW];
1856                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
1857                     buffer_ = newBuffer;
1858                 }
1859                 buffer_[bufferIndex_] = new PCEI();
1860                 buffer_[bufferIndex_].ce_ = ce;
1861                 buffer_[bufferIndex_].low_ = ixLow;
1862                 buffer_[bufferIndex_].high_ = ixHigh;
1863 
1864                 bufferIndex_ += 1;
1865             }
1866 
get()1867             PCEI get() {
1868                 if (bufferIndex_ > 0) {
1869                     return buffer_[--bufferIndex_];
1870                 }
1871                 return null;
1872             }
1873         }
1874 
1875         /** Raw collation element buffer stuff ported from ICU4C ucoleitr.cpp */
1876         private static final class RCEI {
1877             int ce_;
1878             int low_;
1879             int high_;
1880         }
1881 
1882         private static final class RCEBuffer {
1883             private RCEI[] buffer_ = new RCEI[DEFAULT_BUFFER_SIZE];
1884             private int bufferIndex_ = 0;
1885 
empty()1886             boolean empty() {
1887                 return bufferIndex_ <= 0;
1888             }
1889 
put(int ce, int ixLow, int ixHigh)1890             void put(int ce, int ixLow, int ixHigh) {
1891                 if (bufferIndex_ >= buffer_.length) {
1892                     RCEI[] newBuffer = new RCEI[buffer_.length + BUFFER_GROW];
1893                     System.arraycopy(buffer_, 0, newBuffer, 0, buffer_.length);
1894                     buffer_ = newBuffer;
1895                 }
1896                 buffer_[bufferIndex_] = new RCEI();
1897                 buffer_[bufferIndex_].ce_ = ce;
1898                 buffer_[bufferIndex_].low_ = ixLow;
1899                 buffer_[bufferIndex_].high_ = ixHigh;
1900 
1901                 bufferIndex_ += 1;
1902             }
1903 
get()1904             RCEI get() {
1905                 if (bufferIndex_ > 0) {
1906                     return buffer_[--bufferIndex_];
1907                 }
1908                 return null;
1909             }
1910         }
1911     }
1912 
1913     /**
1914      * Java port of ICU4C CEI (usearch.cpp)
1915      *
1916      * CEI  Collation Element + source text index.
1917      *      These structs are kept in the circular buffer.
1918      */
1919     private static class CEI {
1920         long ce_;
1921         int lowIndex_;
1922         int highIndex_;
1923     }
1924 
1925     /**
1926      * CEBuffer A circular buffer of CEs from the text being searched
1927      */
1928     private static class CEBuffer {
1929         // Java porting note: ICU4C uses the size for stack buffer
1930         // static final int DEFAULT_CEBUFFER_SIZE = 96;
1931 
1932         static final int CEBUFFER_EXTRA = 32;
1933         static final int MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L = 8;
1934         static final int MAX_TARGET_IGNORABLES_PER_PAT_OTHER = 3;
1935 
1936         CEI[] buf_;
1937         int bufSize_;
1938         int firstIx_;
1939         int limitIx_;
1940 
1941         // Java porting note: No references in ICU4C implementation
1942         // CollationElementIterator ceIter_;
1943 
1944         StringSearch strSearch_;
1945 
CEBuffer(StringSearch ss)1946         CEBuffer(StringSearch ss) {
1947             strSearch_ = ss;
1948             bufSize_ = ss.pattern_.PCELength_ + CEBUFFER_EXTRA;
1949             if (ss.search_.elementComparisonType_ != ElementComparisonType.STANDARD_ELEMENT_COMPARISON) {
1950                 String patText = ss.pattern_.text_;
1951                 if (patText != null) {
1952                     for (int i = 0; i < patText.length(); i++) {
1953                         char c = patText.charAt(i);
1954                         if (MIGHT_BE_JAMO_L(c)) {
1955                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
1956                         } else {
1957                             // No check for surrogates, we might allocate slightly more buffer than necessary.
1958                             bufSize_ += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
1959                         }
1960                     }
1961                 }
1962             }
1963 
1964             // Not used - see above
1965             // ceIter_ = ss.textIter_;
1966 
1967             firstIx_ = 0;
1968             limitIx_ = 0;
1969 
1970             if (!ss.initTextProcessedIter()) {
1971                 return;
1972             }
1973 
1974             buf_ = new CEI[bufSize_];
1975         }
1976 
1977         // Get the CE with the specified index.
1978         //   Index must be in the range
1979         //             n-history_size < index < n+1
1980         //   where n is the largest index to have been fetched by some previous call to this function.
1981         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
1982         //
get(int index)1983         CEI get(int index) {
1984             int i = index % bufSize_;
1985 
1986             if (index >= firstIx_ && index < limitIx_) {
1987                 // The request was for an entry already in our buffer.
1988                 // Just return it.
1989                 return buf_[i];
1990             }
1991 
1992             // Caller is requesting a new, never accessed before, CE.
1993             // Verify that it is the next one in sequence, which is all
1994             // that is allowed.
1995             if (index != limitIx_) {
1996                 assert(false);
1997                 return null;
1998             }
1999 
2000             // Manage the circular CE buffer indexing
2001             limitIx_++;
2002 
2003             if (limitIx_ - firstIx_ >= bufSize_) {
2004                 // The buffer is full, knock out the lowest-indexed entry.
2005                 firstIx_++;
2006             }
2007 
2008             CollationPCE.Range range = new CollationPCE.Range();
2009             if (buf_[i] == null) {
2010                 buf_[i] = new CEI();
2011             }
2012             buf_[i].ce_ = strSearch_.textProcessedIter_.nextProcessed(range);
2013             buf_[i].lowIndex_ = range.ixLow_;
2014             buf_[i].highIndex_ = range.ixHigh_;
2015 
2016             return buf_[i];
2017         }
2018 
2019         // Get the CE with the specified index.
2020         //   Index must be in the range
2021         //             n-history_size < index < n+1
2022         //   where n is the largest index to have been fetched by some previous call to this function.
2023         //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
2024         //
getPrevious(int index)2025         CEI getPrevious(int index) {
2026             int i = index % bufSize_;
2027 
2028             if (index >= firstIx_ && index < limitIx_) {
2029                 // The request was for an entry already in our buffer.
2030                 // Just return it.
2031                 return buf_[i];
2032             }
2033 
2034             // Caller is requesting a new, never accessed before, CE.
2035             // Verify that it is the next one in sequence, which is all
2036             // that is allowed.
2037             if (index != limitIx_) {
2038                 assert(false);
2039                 return null;
2040             }
2041 
2042             // Manage the circular CE buffer indexing
2043             limitIx_++;
2044 
2045             if (limitIx_ - firstIx_ >= bufSize_) {
2046                 // The buffer is full, knock out the lowest-indexed entry.
2047                 firstIx_++;
2048             }
2049 
2050             CollationPCE.Range range = new CollationPCE.Range();
2051             if (buf_[i] == null) {
2052                 buf_[i] = new CEI();
2053             }
2054             buf_[i].ce_ = strSearch_.textProcessedIter_.previousProcessed(range);
2055             buf_[i].lowIndex_ = range.ixLow_;
2056             buf_[i].highIndex_ = range.ixHigh_;
2057 
2058             return buf_[i];
2059         }
2060 
MIGHT_BE_JAMO_L(char c)2061         static boolean MIGHT_BE_JAMO_L(char c) {
2062             return (c >= 0x1100 && c <= 0x115E)
2063                     || (c >= 0x3131 && c <= 0x314E)
2064                     || (c >= 0x3165 && c <= 0x3186);
2065         }
2066     }
2067 }
2068