• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  ******************************************************************************
6  *
7  *   Copyright (C) 2009-2015, International Business Machines
8  *   Corporation and others.  All Rights Reserved.
9  *
10  ******************************************************************************
11  */
12 
13 package android.icu.impl;
14 
15 import java.util.ArrayList;
16 
17 import android.icu.text.UnicodeSet;
18 import android.icu.text.UnicodeSet.SpanCondition;
19 import android.icu.util.OutputInt;
20 
21 /*
22  * Implement span() etc. for a set with strings.
23  * Avoid recursion because of its exponential complexity.
24  * Instead, try multiple paths at once and track them with an IndexList.
25  */
26 /**
27  * @hide Only a subset of ICU is exposed in Android
28  */
29 public class UnicodeSetStringSpan {
30 
31     /*
32      * Which span() variant will be used? The object is either built for one variant and used once,
33      * or built for all and may be used many times.
34      */
35     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
36     public static final int FWD           = 0x20;
37     public static final int BACK          = 0x10;
38     // public static final int UTF16      = 8;
39     public static final int CONTAINED     = 2;
40     public static final int NOT_CONTAINED = 1;
41 
42     public static final int ALL = 0x7f;
43 
44     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
45     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
46     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
47     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
48 
49     /**
50      * Special spanLength short values. (since Java has not unsigned byte type)
51      * All code points in the string are contained in the parent set.
52      */
53     static final short ALL_CP_CONTAINED = 0xff;
54     /** The spanLength is >=0xfe. */
55     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
56 
57     /** Set for span(). Same as parent but without strings. */
58     private UnicodeSet spanSet;
59 
60     /**
61      * Set for span(not contained).
62      * Same as spanSet, plus characters that start or end strings.
63      */
64     private UnicodeSet spanNotSet;
65 
66     /** The strings of the parent set. */
67     private ArrayList<String> strings;
68 
69     /** The lengths of span(), spanBack() etc. for each string. */
70     private short[] spanLengths;
71 
72     /** Maximum lengths of relevant strings. */
73     private final int maxLength16;
74 
75     /** Are there strings that are not fully contained in the code point set? */
76     private boolean someRelevant;
77 
78     /** Set up for all variants of span()? */
79     private boolean all;
80 
81     /** Span helper */
82     private OffsetList offsets;
83 
84     /**
85      * Constructs for all variants of span(), or only for any one variant.
86      * Initializes as little as possible, for single use.
87      */
UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)88     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
89         spanSet = new UnicodeSet(0, 0x10ffff);
90         // TODO: With Java 6, just take the parent set's strings as is,
91         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
92         // Then iterate via the first() and higher() methods.
93         // (We do not want to create multiple Iterator objects in each span().)
94         // See ICU ticket #7454.
95         strings = setStrings;
96         all = (which == ALL);
97         spanSet.retainAll(set);
98         if (0 != (which & NOT_CONTAINED)) {
99             // Default to the same sets.
100             // addToSpanNotSet() will create a separate set if necessary.
101             spanNotSet = spanSet;
102         }
103         offsets = new OffsetList();
104 
105         // Determine if the strings even need to be taken into account at all for span() etc.
106         // If any string is relevant, then all strings need to be used for
107         // span(longest match) but only the relevant ones for span(while contained).
108         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
109         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
110         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
111         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
112         int stringsLength = strings.size();
113 
114         int i, spanLength;
115         int maxLength16 = 0;
116         someRelevant = false;
117         for (i = 0; i < stringsLength; ++i) {
118             String string = strings.get(i);
119             int length16 = string.length();
120             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
121             if (spanLength < length16) { // Relevant string.
122                 someRelevant = true;
123             }
124             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
125                 maxLength16 = length16;
126             }
127         }
128         this.maxLength16 = maxLength16;
129         if (!someRelevant && (which & WITH_COUNT) == 0) {
130             return;
131         }
132 
133         // Freeze after checking for the need to use strings at all because freezing
134         // a set takes some time and memory which are wasted if there are no relevant strings.
135         if (all) {
136             spanSet.freeze();
137         }
138 
139         int spanBackLengthsOffset;
140 
141         // Allocate a block of meta data.
142         int allocSize;
143         if (all) {
144             // 2 sets of span lengths
145             allocSize = stringsLength * (2);
146         } else {
147             allocSize = stringsLength; // One set of span lengths.
148         }
149         spanLengths = new short[allocSize];
150 
151         if (all) {
152             // Store span lengths for all span() variants.
153             spanBackLengthsOffset = stringsLength;
154         } else {
155             // Store span lengths for only one span() variant.
156             spanBackLengthsOffset = 0;
157         }
158 
159         // Set the meta data and spanNotSet and write the UTF-8 strings.
160 
161         for (i = 0; i < stringsLength; ++i) {
162             String string = strings.get(i);
163             int length16 = string.length();
164             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
165             if (spanLength < length16) { // Relevant string.
166                 if (true /* 0 != (which & UTF16) */) {
167                     if (0 != (which & CONTAINED)) {
168                         if (0 != (which & FWD)) {
169                             spanLengths[i] = makeSpanLengthByte(spanLength);
170                         }
171                         if (0 != (which & BACK)) {
172                             spanLength = length16
173                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
174                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
175                         }
176                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
177                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
178                                                                                      // flag.
179                     }
180                 }
181                 if (0 != (which & NOT_CONTAINED)) {
182                     // Add string start and end code points to the spanNotSet so that
183                     // a span(while not contained) stops before any string.
184                     int c;
185                     if (0 != (which & FWD)) {
186                         c = string.codePointAt(0);
187                         addToSpanNotSet(c);
188                     }
189                     if (0 != (which & BACK)) {
190                         c = string.codePointBefore(length16);
191                         addToSpanNotSet(c);
192                     }
193                 }
194             } else { // Irrelevant string.
195                 if (all) {
196                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
197                 } else {
198                     // All spanXYZLengths pointers contain the same address.
199                     spanLengths[i] = ALL_CP_CONTAINED;
200                 }
201             }
202         }
203 
204         // Finish.
205         if (all) {
206             spanNotSet.freeze();
207         }
208     }
209 
210     /**
211      * Constructs a copy of an existing UnicodeSetStringSpan.
212      * Assumes which==ALL for a frozen set.
213      */
UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)214     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
215             final ArrayList<String> newParentSetStrings) {
216         spanSet = otherStringSpan.spanSet;
217         strings = newParentSetStrings;
218         maxLength16 = otherStringSpan.maxLength16;
219         someRelevant = otherStringSpan.someRelevant;
220         all = true;
221         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
222             spanNotSet = spanSet;
223         } else {
224             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
225         }
226         offsets = new OffsetList();
227 
228         spanLengths = otherStringSpan.spanLengths.clone();
229     }
230 
231     /**
232      * Do the strings need to be checked in span() etc.?
233      *
234      * @return true if strings need to be checked (call span() here),
235      *         false if not (use a BMPSet for best performance).
236      */
needsStringSpanUTF16()237     public boolean needsStringSpanUTF16() {
238         return someRelevant;
239     }
240 
241     /** For fast UnicodeSet::contains(c). */
contains(int c)242     public boolean contains(int c) {
243         return spanSet.contains(c);
244     }
245 
246     /**
247      * Adds a starting or ending string character to the spanNotSet
248      * so that a character span ends before any string.
249      */
addToSpanNotSet(int c)250     private void addToSpanNotSet(int c) {
251         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
252             if (spanSet.contains(c)) {
253                 return; // Nothing to do.
254             }
255             spanNotSet = spanSet.cloneAsThawed();
256         }
257         spanNotSet.add(c);
258     }
259 
260     /*
261      * Note: In span() when spanLength==0
262      * (after a string match, or at the beginning after an empty code point span)
263      * and in spanNot() and spanNotUTF8(),
264      * string matching could use a binary search because all string matches are done
265      * from the same start index.
266      *
267      * For UTF-8, this would require a comparison function that returns UTF-16 order.
268      *
269      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
270      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
271      * API and implementation with a DFA (state machine).
272      */
273 
274     /*
275      * Algorithm for span(SpanCondition.CONTAINED)
276      *
277      * Theoretical algorithm:
278      * - Iterate through the string, and at each code point boundary:
279      *   + If the code point there is in the set, then remember to continue after it.
280      *   + If a set string matches at the current position, then remember to continue after it.
281      *   + Either recursively span for each code point or string match, or recursively span
282      *     for all but the shortest one and iteratively continue the span with the shortest local match.
283      *   + Remember the longest recursive span (the farthest end point).
284      *   + If there is no match at the current position,
285      *     neither for the code point there nor for any set string,
286      *     then stop and return the longest recursive span length.
287      *
288      * Optimized implementation:
289      *
290      * (We assume that most sets will have very few very short strings.
291      * A span using a string-less set is extremely fast.)
292      *
293      * Create and cache a spanSet which contains all of the single code points of the original set
294      * but none of its strings.
295      *
296      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
297      * - Loop:
298      *   + Try to match each set string at the end of the spanLength.
299      *     ~ Set strings that start with set-contained code points
300      *       must be matched with a partial overlap
301      *       because the recursive algorithm would have tried to match them at every position.
302      *     ~ Set strings that entirely consist of set-contained code points
303      *       are irrelevant for span(SpanCondition.CONTAINED)
304      *       because the recursive algorithm would continue after them anyway and
305      *       find the longest recursive match from their end.
306      *     ~ Rather than recursing, note each end point of a set string match.
307      *   + If no set string matched after spanSet.span(),
308      *     then return with where the spanSet.span() ended.
309      *   + If at least one set string matched after spanSet.span(),
310      *     then pop the shortest string match end point and continue the loop,
311      *     trying to match all set strings from there.
312      *   + If at least one more set string matched after a previous string match, then test if the
313      *     code point after the previous string match is also contained in the set.
314      *     Continue the loop with the shortest end point of
315      *     either this code point or a matching set string.
316      *   + If no more set string matched after a previous string match,
317      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
318      *     Stop if spanLength==0, otherwise continue the loop.
319      *
320      * By noting each end point of a set string match, the function visits each string position at most once and
321      * finishes in linear time.
322      *
323      * The recursive algorithm may visit the same string position many times
324      * if multiple paths lead to it and finishes in exponential time.
325      */
326 
327     /*
328      * Algorithm for span(SIMPLE)
329      *
330      * Theoretical algorithm:
331      * - Iterate through the string, and at each code point boundary:
332      *   + If the code point there is in the set, then remember to continue after it.
333      *   + If a set string matches at the current position, then remember to continue after it.
334      *   + Continue from the farthest match position and ignore all others.
335      *   + If there is no match at the current position, then stop and return the current position.
336      *
337      * Optimized implementation:
338      *
339      * (Same assumption and spanSet as above.)
340      *
341      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
342      * - Loop:
343      *   + Try to match each set string at the end of the spanLength.
344      *     ~ Set strings that start with set-contained code points
345      *       must be matched with a partial overlap
346      *       because the standard algorithm would have tried to match them earlier.
347      *     ~ Set strings that entirely consist of set-contained code points
348      *       must be matched with a full overlap because the longest-match algorithm
349      *       would hide set string matches that end earlier.
350      *       Such set strings need not be matched earlier inside the code point span
351      *       because the standard algorithm would then have
352      *       continued after the set string match anyway.
353      *     ~ Remember the longest set string match (farthest end point)
354      *       from the earliest starting point.
355      *   + If no set string matched after spanSet.span(),
356      *     then return with where the spanSet.span() ended.
357      *   + If at least one set string matched,
358      *     then continue the loop after the longest match from the earliest position.
359      *   + If no more set string matched after a previous string match,
360      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
361      *     Stop if spanLength==0, otherwise continue the loop.
362      */
363     /**
364      * Spans a string.
365      *
366      * @param s The string to be spanned
367      * @param start The start index that the span begins
368      * @param spanCondition The span condition
369      * @return the limit (exclusive end) of the span
370      */
span(CharSequence s, int start, SpanCondition spanCondition)371     public int span(CharSequence s, int start, SpanCondition spanCondition) {
372         if (spanCondition == SpanCondition.NOT_CONTAINED) {
373             return spanNot(s, start, null);
374         }
375         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
376         if (spanLimit == s.length()) {
377             return spanLimit;
378         }
379         return spanWithStrings(s, start, spanLimit, spanCondition);
380     }
381 
382     /**
383      * Synchronized method for complicated spans using the offsets.
384      * Avoids synchronization for simple cases.
385      *
386      * @param spanLimit = spanSet.span(s, start, CONTAINED)
387      */
spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)388     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
389             SpanCondition spanCondition) {
390         // Consider strings; they may overlap with the span.
391         int initSize = 0;
392         if (spanCondition == SpanCondition.CONTAINED) {
393             // Use offset list to try all possibilities.
394             initSize = maxLength16;
395         }
396         offsets.setMaxLength(initSize);
397         int length = s.length();
398         int pos = spanLimit, rest = length - spanLimit;
399         int spanLength = spanLimit - start;
400         int i, stringsLength = strings.size();
401         for (;;) {
402             if (spanCondition == SpanCondition.CONTAINED) {
403                 for (i = 0; i < stringsLength; ++i) {
404                     int overlap = spanLengths[i];
405                     if (overlap == ALL_CP_CONTAINED) {
406                         continue; // Irrelevant string.
407                     }
408                     String string = strings.get(i);
409 
410                     int length16 = string.length();
411 
412                     // Try to match this string at pos-overlap..pos.
413                     if (overlap >= LONG_SPAN) {
414                         overlap = length16;
415                         // While contained: No point matching fully inside the code point span.
416                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
417                                                                           // point.
418                     }
419                     if (overlap > spanLength) {
420                         overlap = spanLength;
421                     }
422                     int inc = length16 - overlap; // Keep overlap+inc==length16.
423                     for (;;) {
424                         if (inc > rest) {
425                             break;
426                         }
427                         // Try to match if the increment is not listed already.
428                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
429                             if (inc == rest) {
430                                 return length; // Reached the end of the string.
431                             }
432                             offsets.addOffset(inc);
433                         }
434                         if (overlap == 0) {
435                             break;
436                         }
437                         --overlap;
438                         ++inc;
439                     }
440                 }
441             } else /* SIMPLE */{
442                 int maxInc = 0, maxOverlap = 0;
443                 for (i = 0; i < stringsLength; ++i) {
444                     int overlap = spanLengths[i];
445                     // For longest match, we do need to try to match even an all-contained string
446                     // to find the match from the earliest start.
447 
448                     String string = strings.get(i);
449 
450                     int length16 = string.length();
451 
452                     // Try to match this string at pos-overlap..pos.
453                     if (overlap >= LONG_SPAN) {
454                         overlap = length16;
455                         // Longest match: Need to match fully inside the code point span
456                         // to find the match from the earliest start.
457                     }
458                     if (overlap > spanLength) {
459                         overlap = spanLength;
460                     }
461                     int inc = length16 - overlap; // Keep overlap+inc==length16.
462                     for (;;) {
463                         if (inc > rest || overlap < maxOverlap) {
464                             break;
465                         }
466                         // Try to match if the string is longer or starts earlier.
467                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
468                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
469                             maxInc = inc; // Longest match from earliest start.
470                             maxOverlap = overlap;
471                             break;
472                         }
473                         --overlap;
474                         ++inc;
475                     }
476                 }
477 
478                 if (maxInc != 0 || maxOverlap != 0) {
479                     // Longest-match algorithm, and there was a string match.
480                     // Simply continue after it.
481                     pos += maxInc;
482                     rest -= maxInc;
483                     if (rest == 0) {
484                         return length; // Reached the end of the string.
485                     }
486                     spanLength = 0; // Match strings from after a string match.
487                     continue;
488                 }
489             }
490             // Finished trying to match all strings at pos.
491 
492             if (spanLength != 0 || pos == 0) {
493                 // The position is after an unlimited code point span (spanLength!=0),
494                 // not after a string match.
495                 // The only position where spanLength==0 after a span is pos==0.
496                 // Otherwise, an unlimited code point span is only tried again when no
497                 // strings match, and if such a non-initial span fails we stop.
498                 if (offsets.isEmpty()) {
499                     return pos; // No strings matched after a span.
500                 }
501                 // Match strings from after the next string match.
502             } else {
503                 // The position is after a string match (or a single code point).
504                 if (offsets.isEmpty()) {
505                     // No more strings matched after a previous string match.
506                     // Try another code point span from after the last string match.
507                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
508                     spanLength = spanLimit - pos;
509                     if (spanLength == rest || // Reached the end of the string, or
510                             spanLength == 0 // neither strings nor span progressed.
511                     ) {
512                         return spanLimit;
513                     }
514                     pos += spanLength;
515                     rest -= spanLength;
516                     continue; // spanLength>0: Match strings from after a span.
517                 } else {
518                     // Try to match only one code point from after a string match if some
519                     // string matched beyond it, so that we try all possible positions
520                     // and don't overshoot.
521                     spanLength = spanOne(spanSet, s, pos, rest);
522                     if (spanLength > 0) {
523                         if (spanLength == rest) {
524                             return length; // Reached the end of the string.
525                         }
526                         // Match strings after this code point.
527                         // There cannot be any increments below it because UnicodeSet strings
528                         // contain multiple code points.
529                         pos += spanLength;
530                         rest -= spanLength;
531                         offsets.shift(spanLength);
532                         spanLength = 0;
533                         continue; // Match strings from after a single code point.
534                     }
535                     // Match strings from after the next string match.
536                 }
537             }
538             int minOffset = offsets.popMinimum(null);
539             pos += minOffset;
540             rest -= minOffset;
541             spanLength = 0; // Match strings from after a string match.
542         }
543     }
544 
545     /**
546      * Spans a string and counts the smallest number of set elements on any path across the span.
547      *
548      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
549      *
550      * <p>If the set does not have any fully-contained strings, then we could optimize this
551      * like span(), but such sets are likely rare, and this is at least still linear.
552      *
553      * @param s The string to be spanned
554      * @param start The start index that the span begins
555      * @param spanCondition The span condition
556      * @param outCount The count
557      * @return the limit (exclusive end) of the span
558      */
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)559     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
560             OutputInt outCount) {
561         if (spanCondition == SpanCondition.NOT_CONTAINED) {
562             return spanNot(s, start, outCount);
563         }
564         // Consider strings; they may overlap with the span,
565         // and they may result in a smaller count that with just code points.
566         if (spanCondition == SpanCondition.CONTAINED) {
567             return spanContainedAndCount(s, start, outCount);
568         }
569         // SIMPLE (not synchronized, does not use offsets)
570         int stringsLength = strings.size();
571         int length = s.length();
572         int pos = start;
573         int rest = length - start;
574         int count = 0;
575         while (rest != 0) {
576             // Try to match the next code point.
577             int cpLength = spanOne(spanSet, s, pos, rest);
578             int maxInc = (cpLength > 0) ? cpLength : 0;
579             // Try to match all of the strings.
580             for (int i = 0; i < stringsLength; ++i) {
581                 String string = strings.get(i);
582                 int length16 = string.length();
583                 if (maxInc < length16 && length16 <= rest &&
584                         matches16CPB(s, pos, length, string, length16)) {
585                     maxInc = length16;
586                 }
587             }
588             // We are done if there is no match beyond pos.
589             if (maxInc == 0) {
590                 outCount.value = count;
591                 return pos;
592             }
593             // Continue from the longest match.
594             ++count;
595             pos += maxInc;
596             rest -= maxInc;
597         }
598         outCount.value = count;
599         return pos;
600     }
601 
spanContainedAndCount(CharSequence s, int start, OutputInt outCount)602     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
603         // Use offset list to try all possibilities.
604         offsets.setMaxLength(maxLength16);
605         int stringsLength = strings.size();
606         int length = s.length();
607         int pos = start;
608         int rest = length - start;
609         int count = 0;
610         while (rest != 0) {
611             // Try to match the next code point.
612             int cpLength = spanOne(spanSet, s, pos, rest);
613             if (cpLength > 0) {
614                 offsets.addOffsetAndCount(cpLength, count + 1);
615             }
616             // Try to match all of the strings.
617             for (int i = 0; i < stringsLength; ++i) {
618                 String string = strings.get(i);
619                 int length16 = string.length();
620                 // Note: If the strings were sorted by length, then we could also
621                 // avoid trying to match if there is already a match of the same length.
622                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
623                         matches16CPB(s, pos, length, string, length16)) {
624                     offsets.addOffsetAndCount(length16, count + 1);
625                 }
626             }
627             // We are done if there is no match beyond pos.
628             if (offsets.isEmpty()) {
629                 outCount.value = count;
630                 return pos;
631             }
632             // Continue from the nearest match.
633             int minOffset = offsets.popMinimum(outCount);
634             count = outCount.value;
635             pos += minOffset;
636             rest -= minOffset;
637         }
638         outCount.value = count;
639         return pos;
640     }
641 
642     /**
643      * Span a string backwards.
644      *
645      * @param s The string to be spanned
646      * @param spanCondition The span condition
647      * @return The string index which starts the span (i.e. inclusive).
648      */
spanBack(CharSequence s, int length, SpanCondition spanCondition)649     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
650         if (spanCondition == SpanCondition.NOT_CONTAINED) {
651             return spanNotBack(s, length);
652         }
653         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
654         if (pos == 0) {
655             return 0;
656         }
657         int spanLength = length - pos;
658 
659         // Consider strings; they may overlap with the span.
660         int initSize = 0;
661         if (spanCondition == SpanCondition.CONTAINED) {
662             // Use offset list to try all possibilities.
663             initSize = maxLength16;
664         }
665         offsets.setMaxLength(initSize);
666         int i, stringsLength = strings.size();
667         int spanBackLengthsOffset = 0;
668         if (all) {
669             spanBackLengthsOffset = stringsLength;
670         }
671         for (;;) {
672             if (spanCondition == SpanCondition.CONTAINED) {
673                 for (i = 0; i < stringsLength; ++i) {
674                     int overlap = spanLengths[spanBackLengthsOffset + i];
675                     if (overlap == ALL_CP_CONTAINED) {
676                         continue; // Irrelevant string.
677                     }
678                     String string = strings.get(i);
679 
680                     int length16 = string.length();
681 
682                     // Try to match this string at pos-(length16-overlap)..pos-length16.
683                     if (overlap >= LONG_SPAN) {
684                         overlap = length16;
685                         // While contained: No point matching fully inside the code point span.
686                         int len1 = 0;
687                         len1 = string.offsetByCodePoints(0, 1);
688                         overlap -= len1; // Length of the string minus the first code point.
689                     }
690                     if (overlap > spanLength) {
691                         overlap = spanLength;
692                     }
693                     int dec = length16 - overlap; // Keep dec+overlap==length16.
694                     for (;;) {
695                         if (dec > pos) {
696                             break;
697                         }
698                         // Try to match if the decrement is not listed already.
699                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
700                             if (dec == pos) {
701                                 return 0; // Reached the start of the string.
702                             }
703                             offsets.addOffset(dec);
704                         }
705                         if (overlap == 0) {
706                             break;
707                         }
708                         --overlap;
709                         ++dec;
710                     }
711                 }
712             } else /* SIMPLE */{
713                 int maxDec = 0, maxOverlap = 0;
714                 for (i = 0; i < stringsLength; ++i) {
715                     int overlap = spanLengths[spanBackLengthsOffset + i];
716                     // For longest match, we do need to try to match even an all-contained string
717                     // to find the match from the latest end.
718 
719                     String string = strings.get(i);
720 
721                     int length16 = string.length();
722 
723                     // Try to match this string at pos-(length16-overlap)..pos-length16.
724                     if (overlap >= LONG_SPAN) {
725                         overlap = length16;
726                         // Longest match: Need to match fully inside the code point span
727                         // to find the match from the latest end.
728                     }
729                     if (overlap > spanLength) {
730                         overlap = spanLength;
731                     }
732                     int dec = length16 - overlap; // Keep dec+overlap==length16.
733                     for (;;) {
734                         if (dec > pos || overlap < maxOverlap) {
735                             break;
736                         }
737                         // Try to match if the string is longer or ends later.
738                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
739                                 && matches16CPB(s, pos - dec, length, string, length16)) {
740                             maxDec = dec; // Longest match from latest end.
741                             maxOverlap = overlap;
742                             break;
743                         }
744                         --overlap;
745                         ++dec;
746                     }
747                 }
748 
749                 if (maxDec != 0 || maxOverlap != 0) {
750                     // Longest-match algorithm, and there was a string match.
751                     // Simply continue before it.
752                     pos -= maxDec;
753                     if (pos == 0) {
754                         return 0; // Reached the start of the string.
755                     }
756                     spanLength = 0; // Match strings from before a string match.
757                     continue;
758                 }
759             }
760             // Finished trying to match all strings at pos.
761 
762             if (spanLength != 0 || pos == length) {
763                 // The position is before an unlimited code point span (spanLength!=0),
764                 // not before a string match.
765                 // The only position where spanLength==0 before a span is pos==length.
766                 // Otherwise, an unlimited code point span is only tried again when no
767                 // strings match, and if such a non-initial span fails we stop.
768                 if (offsets.isEmpty()) {
769                     return pos; // No strings matched before a span.
770                 }
771                 // Match strings from before the next string match.
772             } else {
773                 // The position is before a string match (or a single code point).
774                 if (offsets.isEmpty()) {
775                     // No more strings matched before a previous string match.
776                     // Try another code point span from before the last string match.
777                     int oldPos = pos;
778                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
779                     spanLength = oldPos - pos;
780                     if (pos == 0 || // Reached the start of the string, or
781                             spanLength == 0 // neither strings nor span progressed.
782                     ) {
783                         return pos;
784                     }
785                     continue; // spanLength>0: Match strings from before a span.
786                 } else {
787                     // Try to match only one code point from before a string match if some
788                     // string matched beyond it, so that we try all possible positions
789                     // and don't overshoot.
790                     spanLength = spanOneBack(spanSet, s, pos);
791                     if (spanLength > 0) {
792                         if (spanLength == pos) {
793                             return 0; // Reached the start of the string.
794                         }
795                         // Match strings before this code point.
796                         // There cannot be any decrements below it because UnicodeSet strings
797                         // contain multiple code points.
798                         pos -= spanLength;
799                         offsets.shift(spanLength);
800                         spanLength = 0;
801                         continue; // Match strings from before a single code point.
802                     }
803                     // Match strings from before the next string match.
804                 }
805             }
806             pos -= offsets.popMinimum(null);
807             spanLength = 0; // Match strings from before a string match.
808         }
809     }
810 
811     /**
812      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
813      *
814      * Theoretical algorithm:
815      * - Iterate through the string, and at each code point boundary:
816      *   + If the code point there is in the set, then return with the current position.
817      *   + If a set string matches at the current position, then return with the current position.
818      *
819      * Optimized implementation:
820      *
821      * (Same assumption as for span() above.)
822      *
823      * Create and cache a spanNotSet which contains
824      * all of the single code points of the original set but none of its strings.
825      * For each set string add its initial code point to the spanNotSet.
826      * (Also add its final code point for spanNotBack().)
827      *
828      * - Loop:
829      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
830      *   + If the current code point is in the original set, then return the current position.
831      *   + If any set string matches at the current position, then return the current position.
832      *   + If there is no match at the current position, neither for the code point
833      *     there nor for any set string, then skip this code point and continue the loop.
834      *     This happens for set-string-initial code points that were added to spanNotSet
835      *     when there is not actually a match for such a set string.
836      *
837      * @param s The string to be spanned
838      * @param start The start index that the span begins
839      * @param outCount If not null: Receives the number of code points across the span.
840      * @return the limit (exclusive end) of the span
841      */
spanNot(CharSequence s, int start, OutputInt outCount)842     private int spanNot(CharSequence s, int start, OutputInt outCount) {
843         int length = s.length();
844         int pos = start, rest = length - start;
845         int stringsLength = strings.size();
846         int count = 0;
847         do {
848             // Span until we find a code point from the set,
849             // or a code point that starts or ends some string.
850             int spanLimit;
851             if (outCount == null) {
852                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
853             } else {
854                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
855                 outCount.value = count = count + outCount.value;
856             }
857             if (spanLimit == length) {
858                 return length; // Reached the end of the string.
859             }
860             pos = spanLimit;
861             rest = length - spanLimit;
862 
863             // Check whether the current code point is in the original set,
864             // without the string starts and ends.
865             int cpLength = spanOne(spanSet, s, pos, rest);
866             if (cpLength > 0) {
867                 return pos; // There is a set element at pos.
868             }
869 
870             // Try to match the strings at pos.
871             for (int i = 0; i < stringsLength; ++i) {
872                 if (spanLengths[i] == ALL_CP_CONTAINED) {
873                     continue; // Irrelevant string.
874                 }
875                 String string = strings.get(i);
876 
877                 int length16 = string.length();
878                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
879                     return pos; // There is a set element at pos.
880                 }
881             }
882 
883             // The span(while not contained) ended on a string start/end which is
884             // not in the original set. Skip this code point and continue.
885             // cpLength<0
886             pos -= cpLength;
887             rest += cpLength;
888             ++count;
889         } while (rest != 0);
890         if (outCount != null) {
891             outCount.value = count;
892         }
893         return length; // Reached the end of the string.
894     }
895 
spanNotBack(CharSequence s, int length)896     private int spanNotBack(CharSequence s, int length) {
897         int pos = length;
898         int i, stringsLength = strings.size();
899         do {
900             // Span until we find a code point from the set,
901             // or a code point that starts or ends some string.
902             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
903             if (pos == 0) {
904                 return 0; // Reached the start of the string.
905             }
906 
907             // Check whether the current code point is in the original set,
908             // without the string starts and ends.
909             int cpLength = spanOneBack(spanSet, s, pos);
910             if (cpLength > 0) {
911                 return pos; // There is a set element at pos.
912             }
913 
914             // Try to match the strings at pos.
915             for (i = 0; i < stringsLength; ++i) {
916                 // Use spanLengths rather than a spanLengths pointer because
917                 // it is easier and we only need to know whether the string is irrelevant
918                 // which is the same in either array.
919                 if (spanLengths[i] == ALL_CP_CONTAINED) {
920                     continue; // Irrelevant string.
921                 }
922                 String string = strings.get(i);
923 
924                 int length16 = string.length();
925                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
926                     return pos; // There is a set element at pos.
927                 }
928             }
929 
930             // The span(while not contained) ended on a string start/end which is
931             // not in the original set. Skip this code point and continue.
932             // cpLength<0
933             pos += cpLength;
934         } while (pos != 0);
935         return 0; // Reached the start of the string.
936     }
937 
makeSpanLengthByte(int spanLength)938     static short makeSpanLengthByte(int spanLength) {
939         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
940         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
941     }
942 
943     // Compare strings without any argument checks. Requires length>0.
matches16(CharSequence s, int start, final String t, int length)944     private static boolean matches16(CharSequence s, int start, final String t, int length) {
945         int end = start + length;
946         while (length-- > 0) {
947             if (s.charAt(--end) != t.charAt(length)) {
948                 return false;
949             }
950         }
951         return true;
952     }
953 
954     /**
955      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
956      * at code point boundaries.
957      * That is, each edge of a match must not be in the middle of a surrogate pair.
958      * @param s       The string to match in.
959      * @param start   The start index of s.
960      * @param limit   The limit of the subsequence of s being spanned.
961      * @param t       The substring to be matched in s.
962      * @param tlength The length of t.
963      */
matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)964     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
965         return matches16(s, start, t, tlength)
966                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
967                         Character.isLowSurrogate(s.charAt(start)))
968                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
969                         Character.isLowSurrogate(s.charAt(start + tlength)));
970     }
971 
972     /**
973      * Does the set contain the next code point?
974      * If so, return its length; otherwise return its negative length.
975      */
spanOne(final UnicodeSet set, CharSequence s, int start, int length)976     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
977         char c = s.charAt(start);
978         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
979             char c2 = s.charAt(start + 1);
980             if (android.icu.text.UTF16.isTrailSurrogate(c2)) {
981                 int supplementary = Character.toCodePoint(c, c2);
982                 return set.contains(supplementary) ? 2 : -2;
983             }
984         }
985         return set.contains(c) ? 1 : -1;
986     }
987 
spanOneBack(final UnicodeSet set, CharSequence s, int length)988     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
989         char c = s.charAt(length - 1);
990         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
991             char c2 = s.charAt(length - 2);
992             if (android.icu.text.UTF16.isLeadSurrogate(c2)) {
993                 int supplementary = Character.toCodePoint(c2, c);
994                 return set.contains(supplementary) ? 2 : -2;
995             }
996         }
997         return set.contains(c) ? 1 : -1;
998     }
999 
1000     /**
1001      * Helper class for UnicodeSetStringSpan.
1002      *
1003      * <p>List of offsets from the current position from where to try matching
1004      * a code point or a string.
1005      * Stores offsets rather than indexes to simplify the code and use the same list
1006      * for both increments (in span()) and decrements (in spanBack()).
1007      *
1008      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1009      * are relatively dense, that is,
1010      * there are normally no gaps of hundreds or thousands of offset values.
1011      *
1012      * <p>This class optionally also tracks the minimum non-negative count for each position,
1013      * intended to count the smallest number of elements of any path leading to that position.
1014      *
1015      * <p>The implementation uses a circular buffer of count integers,
1016      * each indicating whether the corresponding offset is in the list,
1017      * and its path element count.
1018      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1019      * and physically moving part of the list.
1020      *
1021      * <p>Note: In principle, the caller should setMaxLength() to
1022      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1023      * to account for "long" single code points.
1024      *
1025      * <p>Note: An earlier version did not track counts and stored only byte flags.
1026      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1027      * the list could be stored as bit flags in a single integer.
1028      * Rather than handling a circular buffer with a start list index,
1029      * the integer would simply be shifted when lower offsets are removed.
1030      * UnicodeSet does not have a limit on the lengths of strings.
1031      */
1032     private static final class OffsetList {
1033         private int[] list;
1034         private int length;
1035         private int start;
1036 
OffsetList()1037         public OffsetList() {
1038             list = new int[16];  // default size
1039         }
1040 
setMaxLength(int maxLength)1041         public void setMaxLength(int maxLength) {
1042             if (maxLength > list.length) {
1043                 list = new int[maxLength];
1044             }
1045             clear();
1046         }
1047 
clear()1048         public void clear() {
1049             for (int i = list.length; i-- > 0;) {
1050                 list[i] = 0;
1051             }
1052             start = length = 0;
1053         }
1054 
isEmpty()1055         public boolean isEmpty() {
1056             return (length == 0);
1057         }
1058 
1059         /**
1060          * Reduces all stored offsets by delta, used when the current position moves by delta.
1061          * There must not be any offsets lower than delta.
1062          * If there is an offset equal to delta, it is removed.
1063          *
1064          * @param delta [1..maxLength]
1065          */
shift(int delta)1066         public void shift(int delta) {
1067             int i = start + delta;
1068             if (i >= list.length) {
1069                 i -= list.length;
1070             }
1071             if (list[i] != 0) {
1072                 list[i] = 0;
1073                 --length;
1074             }
1075             start = i;
1076         }
1077 
1078         /**
1079          * Adds an offset. The list must not contain it yet.
1080          * @param offset [1..maxLength]
1081          */
addOffset(int offset)1082         public void addOffset(int offset) {
1083             int i = start + offset;
1084             if (i >= list.length) {
1085                 i -= list.length;
1086             }
1087             assert list[i] == 0;
1088             list[i] = 1;
1089             ++length;
1090         }
1091 
1092         /**
1093          * Adds an offset and updates its count.
1094          * The list may already contain the offset.
1095          * @param offset [1..maxLength]
1096          */
addOffsetAndCount(int offset, int count)1097         public void addOffsetAndCount(int offset, int count) {
1098             assert count > 0;
1099             int i = start + offset;
1100             if (i >= list.length) {
1101                 i -= list.length;
1102             }
1103             if (list[i] == 0) {
1104                 list[i] = count;
1105                 ++length;
1106             } else if (count < list[i]) {
1107                 list[i] = count;
1108             }
1109         }
1110 
1111         /**
1112          * @param offset [1..maxLength]
1113          */
containsOffset(int offset)1114         public boolean containsOffset(int offset) {
1115             int i = start + offset;
1116             if (i >= list.length) {
1117                 i -= list.length;
1118             }
1119             return list[i] != 0;
1120         }
1121 
1122         /**
1123          * @param offset [1..maxLength]
1124          */
hasCountAtOffset(int offset, int count)1125         public boolean hasCountAtOffset(int offset, int count) {
1126             int i = start + offset;
1127             if (i >= list.length) {
1128                 i -= list.length;
1129             }
1130             int oldCount = list[i];
1131             return oldCount != 0 && oldCount <= count;
1132         }
1133 
1134         /**
1135          * Finds the lowest stored offset from a non-empty list, removes it,
1136          * and reduces all other offsets by this minimum.
1137          * @return min=[1..maxLength]
1138          */
popMinimum(OutputInt outCount)1139         public int popMinimum(OutputInt outCount) {
1140             // Look for the next offset in list[start+1..list.length-1].
1141             int i = start, result;
1142             while (++i < list.length) {
1143                 int count = list[i];
1144                 if (count != 0) {
1145                     list[i] = 0;
1146                     --length;
1147                     result = i - start;
1148                     start = i;
1149                     if (outCount != null) { outCount.value = count; }
1150                     return result;
1151                 }
1152             }
1153             // i==list.length
1154 
1155             // Wrap around and look for the next offset in list[0..start].
1156             // Since the list is not empty, there will be one.
1157             result = list.length - start;
1158             i = 0;
1159             int count;
1160             while ((count = list[i]) == 0) {
1161                 ++i;
1162             }
1163             list[i] = 0;
1164             --length;
1165             start = i;
1166             if (outCount != null) { outCount.value = count; }
1167             return result + i;
1168         }
1169     }
1170 }
1171