• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  *
6  *   Copyright (C) 2009-2015, International Business Machines
7  *   Corporation and others.  All Rights Reserved.
8  *
9  ******************************************************************************
10  */
11 
12 package com.ibm.icu.impl;
13 
14 import java.util.ArrayList;
15 
16 import com.ibm.icu.text.UnicodeSet;
17 import com.ibm.icu.text.UnicodeSet.SpanCondition;
18 import com.ibm.icu.util.OutputInt;
19 
20 /*
21  * Implement span() etc. for a set with strings.
22  * Avoid recursion because of its exponential complexity.
23  * Instead, try multiple paths at once and track them with an IndexList.
24  */
25 public class UnicodeSetStringSpan {
26 
27     /*
28      * Which span() variant will be used? The object is either built for one variant and used once,
29      * or built for all and may be used many times.
30      */
31     public static final int WITH_COUNT    = 0x40;  // spanAndCount() may be called
32     public static final int FWD           = 0x20;
33     public static final int BACK          = 0x10;
34     // public static final int UTF16      = 8;
35     public static final int CONTAINED     = 2;
36     public static final int NOT_CONTAINED = 1;
37 
38     public static final int ALL = 0x7f;
39 
40     public static final int FWD_UTF16_CONTAINED      = FWD  | /* UTF16 | */    CONTAINED;
41     public static final int FWD_UTF16_NOT_CONTAINED  = FWD  | /* UTF16 | */NOT_CONTAINED;
42     public static final int BACK_UTF16_CONTAINED     = BACK | /* UTF16 | */    CONTAINED;
43     public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED;
44 
45     /**
46      * Special spanLength short values. (since Java has not unsigned byte type)
47      * All code points in the string are contained in the parent set.
48      */
49     static final short ALL_CP_CONTAINED = 0xff;
50     /** The spanLength is >=0xfe. */
51     static final short LONG_SPAN = ALL_CP_CONTAINED - 1;
52 
53     /** Set for span(). Same as parent but without strings. */
54     private UnicodeSet spanSet;
55 
56     /**
57      * Set for span(not contained).
58      * Same as spanSet, plus characters that start or end strings.
59      */
60     private UnicodeSet spanNotSet;
61 
62     /** The strings of the parent set. */
63     private ArrayList<String> strings;
64 
65     /** The lengths of span(), spanBack() etc. for each string. */
66     private short[] spanLengths;
67 
68     /** Maximum lengths of relevant strings. */
69     private final int maxLength16;
70 
71     /** Are there strings that are not fully contained in the code point set? */
72     private boolean someRelevant;
73 
74     /** Set up for all variants of span()? */
75     private boolean all;
76 
77     /** Span helper */
78     private OffsetList offsets;
79 
80     /**
81      * Constructs for all variants of span(), or only for any one variant.
82      * Initializes as little as possible, for single use.
83      */
UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)84     public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) {
85         spanSet = new UnicodeSet(0, 0x10ffff);
86         // TODO: With Java 6, just take the parent set's strings as is,
87         // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings.
88         // Then iterate via the first() and higher() methods.
89         // (We do not want to create multiple Iterator objects in each span().)
90         // See ICU ticket #7454.
91         strings = setStrings;
92         all = (which == ALL);
93         spanSet.retainAll(set);
94         if (0 != (which & NOT_CONTAINED)) {
95             // Default to the same sets.
96             // addToSpanNotSet() will create a separate set if necessary.
97             spanNotSet = spanSet;
98         }
99         offsets = new OffsetList();
100 
101         // Determine if the strings even need to be taken into account at all for span() etc.
102         // If any string is relevant, then all strings need to be used for
103         // span(longest match) but only the relevant ones for span(while contained).
104         // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
105         // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
106         // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
107         // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
108         int stringsLength = strings.size();
109 
110         int i, spanLength;
111         int maxLength16 = 0;
112         someRelevant = false;
113         for (i = 0; i < stringsLength;) {
114             String string = strings.get(i);
115             int length16 = string.length();
116             if (length16 == 0) {
117                 // Remove the empty string.
118                 strings.remove(i);
119                 --stringsLength;
120                 continue;
121             }
122             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
123             if (spanLength < length16) { // Relevant string.
124                 someRelevant = true;
125             }
126             if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
127                 maxLength16 = length16;
128             }
129             ++i;
130         }
131         this.maxLength16 = maxLength16;
132         if (!someRelevant && (which & WITH_COUNT) == 0) {
133             return;
134         }
135 
136         // Freeze after checking for the need to use strings at all because freezing
137         // a set takes some time and memory which are wasted if there are no relevant strings.
138         if (all) {
139             spanSet.freeze();
140         }
141 
142         int spanBackLengthsOffset;
143 
144         // Allocate a block of meta data.
145         int allocSize;
146         if (all) {
147             // 2 sets of span lengths
148             allocSize = stringsLength * (2);
149         } else {
150             allocSize = stringsLength; // One set of span lengths.
151         }
152         spanLengths = new short[allocSize];
153 
154         if (all) {
155             // Store span lengths for all span() variants.
156             spanBackLengthsOffset = stringsLength;
157         } else {
158             // Store span lengths for only one span() variant.
159             spanBackLengthsOffset = 0;
160         }
161 
162         // Set the meta data and spanNotSet and write the UTF-8 strings.
163 
164         for (i = 0; i < stringsLength; ++i) {
165             String string = strings.get(i);
166             int length16 = string.length();
167             spanLength = spanSet.span(string, SpanCondition.CONTAINED);
168             if (spanLength < length16) { // Relevant string.
169                 if (true /* 0 != (which & UTF16) */) {
170                     if (0 != (which & CONTAINED)) {
171                         if (0 != (which & FWD)) {
172                             spanLengths[i] = makeSpanLengthByte(spanLength);
173                         }
174                         if (0 != (which & BACK)) {
175                             spanLength = length16
176                                     - spanSet.spanBack(string, length16, SpanCondition.CONTAINED);
177                             spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength);
178                         }
179                     } else /* not CONTAINED, not all, but NOT_CONTAINED */{
180                         spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant
181                                                                                      // flag.
182                     }
183                 }
184                 if (0 != (which & NOT_CONTAINED)) {
185                     // Add string start and end code points to the spanNotSet so that
186                     // a span(while not contained) stops before any string.
187                     int c;
188                     if (0 != (which & FWD)) {
189                         c = string.codePointAt(0);
190                         addToSpanNotSet(c);
191                     }
192                     if (0 != (which & BACK)) {
193                         c = string.codePointBefore(length16);
194                         addToSpanNotSet(c);
195                     }
196                 }
197             } else { // Irrelevant string.
198                 if (all) {
199                     spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED;
200                 } else {
201                     // All spanXYZLengths pointers contain the same address.
202                     spanLengths[i] = ALL_CP_CONTAINED;
203                 }
204             }
205         }
206 
207         // Finish.
208         if (all) {
209             spanNotSet.freeze();
210         }
211     }
212 
213     /**
214      * Constructs a copy of an existing UnicodeSetStringSpan.
215      * Assumes which==ALL for a frozen set.
216      */
UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)217     public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan,
218             final ArrayList<String> newParentSetStrings) {
219         spanSet = otherStringSpan.spanSet;
220         strings = newParentSetStrings;
221         maxLength16 = otherStringSpan.maxLength16;
222         someRelevant = otherStringSpan.someRelevant;
223         all = true;
224         if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) {
225             spanNotSet = spanSet;
226         } else {
227             spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone();
228         }
229         offsets = new OffsetList();
230 
231         spanLengths = otherStringSpan.spanLengths.clone();
232     }
233 
234     /**
235      * Do the strings need to be checked in span() etc.?
236      *
237      * @return true if strings need to be checked (call span() here),
238      *         false if not (use a BMPSet for best performance).
239      */
needsStringSpanUTF16()240     public boolean needsStringSpanUTF16() {
241         return someRelevant;
242     }
243 
244     /** For fast UnicodeSet::contains(c). */
contains(int c)245     public boolean contains(int c) {
246         return spanSet.contains(c);
247     }
248 
249     /**
250      * Adds a starting or ending string character to the spanNotSet
251      * so that a character span ends before any string.
252      */
addToSpanNotSet(int c)253     private void addToSpanNotSet(int c) {
254         if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) {
255             if (spanSet.contains(c)) {
256                 return; // Nothing to do.
257             }
258             spanNotSet = spanSet.cloneAsThawed();
259         }
260         spanNotSet.add(c);
261     }
262 
263     /*
264      * Note: In span() when spanLength==0
265      * (after a string match, or at the beginning after an empty code point span)
266      * and in spanNot() and spanNotUTF8(),
267      * string matching could use a binary search because all string matches are done
268      * from the same start index.
269      *
270      * For UTF-8, this would require a comparison function that returns UTF-16 order.
271      *
272      * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets
273      * with strings have very few very short strings. For cases with many strings, it might be better to use a different
274      * API and implementation with a DFA (state machine).
275      */
276 
277     /*
278      * Algorithm for span(SpanCondition.CONTAINED)
279      *
280      * Theoretical algorithm:
281      * - Iterate through the string, and at each code point boundary:
282      *   + If the code point there is in the set, then remember to continue after it.
283      *   + If a set string matches at the current position, then remember to continue after it.
284      *   + Either recursively span for each code point or string match, or recursively span
285      *     for all but the shortest one and iteratively continue the span with the shortest local match.
286      *   + Remember the longest recursive span (the farthest end point).
287      *   + If there is no match at the current position,
288      *     neither for the code point there nor for any set string,
289      *     then stop and return the longest recursive span length.
290      *
291      * Optimized implementation:
292      *
293      * (We assume that most sets will have very few very short strings.
294      * A span using a string-less set is extremely fast.)
295      *
296      * Create and cache a spanSet which contains all of the single code points of the original set
297      * but none of its strings.
298      *
299      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
300      * - Loop:
301      *   + Try to match each set string at the end of the spanLength.
302      *     ~ Set strings that start with set-contained code points
303      *       must be matched with a partial overlap
304      *       because the recursive algorithm would have tried to match them at every position.
305      *     ~ Set strings that entirely consist of set-contained code points
306      *       are irrelevant for span(SpanCondition.CONTAINED)
307      *       because the recursive algorithm would continue after them anyway and
308      *       find the longest recursive match from their end.
309      *     ~ Rather than recursing, note each end point of a set string match.
310      *   + If no set string matched after spanSet.span(),
311      *     then return with where the spanSet.span() ended.
312      *   + If at least one set string matched after spanSet.span(),
313      *     then pop the shortest string match end point and continue the loop,
314      *     trying to match all set strings from there.
315      *   + If at least one more set string matched after a previous string match, then test if the
316      *     code point after the previous string match is also contained in the set.
317      *     Continue the loop with the shortest end point of
318      *     either this code point or a matching set string.
319      *   + If no more set string matched after a previous string match,
320      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
321      *     Stop if spanLength==0, otherwise continue the loop.
322      *
323      * By noting each end point of a set string match, the function visits each string position at most once and
324      * finishes in linear time.
325      *
326      * The recursive algorithm may visit the same string position many times
327      * if multiple paths lead to it and finishes in exponential time.
328      */
329 
330     /*
331      * Algorithm for span(SIMPLE)
332      *
333      * Theoretical algorithm:
334      * - Iterate through the string, and at each code point boundary:
335      *   + If the code point there is in the set, then remember to continue after it.
336      *   + If a set string matches at the current position, then remember to continue after it.
337      *   + Continue from the farthest match position and ignore all others.
338      *   + If there is no match at the current position, then stop and return the current position.
339      *
340      * Optimized implementation:
341      *
342      * (Same assumption and spanSet as above.)
343      *
344      * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED).
345      * - Loop:
346      *   + Try to match each set string at the end of the spanLength.
347      *     ~ Set strings that start with set-contained code points
348      *       must be matched with a partial overlap
349      *       because the standard algorithm would have tried to match them earlier.
350      *     ~ Set strings that entirely consist of set-contained code points
351      *       must be matched with a full overlap because the longest-match algorithm
352      *       would hide set string matches that end earlier.
353      *       Such set strings need not be matched earlier inside the code point span
354      *       because the standard algorithm would then have
355      *       continued after the set string match anyway.
356      *     ~ Remember the longest set string match (farthest end point)
357      *       from the earliest starting point.
358      *   + If no set string matched after spanSet.span(),
359      *     then return with where the spanSet.span() ended.
360      *   + If at least one set string matched,
361      *     then continue the loop after the longest match from the earliest position.
362      *   + If no more set string matched after a previous string match,
363      *     then try another spanLength=spanSet.span(SpanCondition.CONTAINED).
364      *     Stop if spanLength==0, otherwise continue the loop.
365      */
366     /**
367      * Spans a string.
368      *
369      * @param s The string to be spanned
370      * @param start The start index that the span begins
371      * @param spanCondition The span condition
372      * @return the limit (exclusive end) of the span
373      */
span(CharSequence s, int start, SpanCondition spanCondition)374     public int span(CharSequence s, int start, SpanCondition spanCondition) {
375         if (spanCondition == SpanCondition.NOT_CONTAINED) {
376             return spanNot(s, start, null);
377         }
378         int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED);
379         if (spanLimit == s.length()) {
380             return spanLimit;
381         }
382         return spanWithStrings(s, start, spanLimit, spanCondition);
383     }
384 
385     /**
386      * Synchronized method for complicated spans using the offsets.
387      * Avoids synchronization for simple cases.
388      *
389      * @param spanLimit = spanSet.span(s, start, CONTAINED)
390      */
spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)391     private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit,
392             SpanCondition spanCondition) {
393         // Consider strings; they may overlap with the span.
394         int initSize = 0;
395         if (spanCondition == SpanCondition.CONTAINED) {
396             // Use offset list to try all possibilities.
397             initSize = maxLength16;
398         }
399         offsets.setMaxLength(initSize);
400         int length = s.length();
401         int pos = spanLimit, rest = length - spanLimit;
402         int spanLength = spanLimit - start;
403         int i, stringsLength = strings.size();
404         for (;;) {
405             if (spanCondition == SpanCondition.CONTAINED) {
406                 for (i = 0; i < stringsLength; ++i) {
407                     int overlap = spanLengths[i];
408                     if (overlap == ALL_CP_CONTAINED) {
409                         continue; // Irrelevant string.
410                     }
411                     String string = strings.get(i);
412 
413                     int length16 = string.length();
414 
415                     // Try to match this string at pos-overlap..pos.
416                     if (overlap >= LONG_SPAN) {
417                         overlap = length16;
418                         // While contained: No point matching fully inside the code point span.
419                         overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code
420                                                                           // point.
421                     }
422                     if (overlap > spanLength) {
423                         overlap = spanLength;
424                     }
425                     int inc = length16 - overlap; // Keep overlap+inc==length16.
426                     for (;;) {
427                         if (inc > rest) {
428                             break;
429                         }
430                         // Try to match if the increment is not listed already.
431                         if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) {
432                             if (inc == rest) {
433                                 return length; // Reached the end of the string.
434                             }
435                             offsets.addOffset(inc);
436                         }
437                         if (overlap == 0) {
438                             break;
439                         }
440                         --overlap;
441                         ++inc;
442                     }
443                 }
444             } else /* SIMPLE */{
445                 int maxInc = 0, maxOverlap = 0;
446                 for (i = 0; i < stringsLength; ++i) {
447                     int overlap = spanLengths[i];
448                     // For longest match, we do need to try to match even an all-contained string
449                     // to find the match from the earliest start.
450 
451                     String string = strings.get(i);
452 
453                     int length16 = string.length();
454 
455                     // Try to match this string at pos-overlap..pos.
456                     if (overlap >= LONG_SPAN) {
457                         overlap = length16;
458                         // Longest match: Need to match fully inside the code point span
459                         // to find the match from the earliest start.
460                     }
461                     if (overlap > spanLength) {
462                         overlap = spanLength;
463                     }
464                     int inc = length16 - overlap; // Keep overlap+inc==length16.
465                     for (;;) {
466                         if (inc > rest || overlap < maxOverlap) {
467                             break;
468                         }
469                         // Try to match if the string is longer or starts earlier.
470                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc)
471                                 && matches16CPB(s, pos - overlap, length, string, length16)) {
472                             maxInc = inc; // Longest match from earliest start.
473                             maxOverlap = overlap;
474                             break;
475                         }
476                         --overlap;
477                         ++inc;
478                     }
479                 }
480 
481                 if (maxInc != 0 || maxOverlap != 0) {
482                     // Longest-match algorithm, and there was a string match.
483                     // Simply continue after it.
484                     pos += maxInc;
485                     rest -= maxInc;
486                     if (rest == 0) {
487                         return length; // Reached the end of the string.
488                     }
489                     spanLength = 0; // Match strings from after a string match.
490                     continue;
491                 }
492             }
493             // Finished trying to match all strings at pos.
494 
495             if (spanLength != 0 || pos == 0) {
496                 // The position is after an unlimited code point span (spanLength!=0),
497                 // not after a string match.
498                 // The only position where spanLength==0 after a span is pos==0.
499                 // Otherwise, an unlimited code point span is only tried again when no
500                 // strings match, and if such a non-initial span fails we stop.
501                 if (offsets.isEmpty()) {
502                     return pos; // No strings matched after a span.
503                 }
504                 // Match strings from after the next string match.
505             } else {
506                 // The position is after a string match (or a single code point).
507                 if (offsets.isEmpty()) {
508                     // No more strings matched after a previous string match.
509                     // Try another code point span from after the last string match.
510                     spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED);
511                     spanLength = spanLimit - pos;
512                     if (spanLength == rest || // Reached the end of the string, or
513                             spanLength == 0 // neither strings nor span progressed.
514                     ) {
515                         return spanLimit;
516                     }
517                     pos += spanLength;
518                     rest -= spanLength;
519                     continue; // spanLength>0: Match strings from after a span.
520                 } else {
521                     // Try to match only one code point from after a string match if some
522                     // string matched beyond it, so that we try all possible positions
523                     // and don't overshoot.
524                     spanLength = spanOne(spanSet, s, pos, rest);
525                     if (spanLength > 0) {
526                         if (spanLength == rest) {
527                             return length; // Reached the end of the string.
528                         }
529                         // Match strings after this code point.
530                         // There cannot be any increments below it because UnicodeSet strings
531                         // contain multiple code points.
532                         pos += spanLength;
533                         rest -= spanLength;
534                         offsets.shift(spanLength);
535                         spanLength = 0;
536                         continue; // Match strings from after a single code point.
537                     }
538                     // Match strings from after the next string match.
539                 }
540             }
541             int minOffset = offsets.popMinimum(null);
542             pos += minOffset;
543             rest -= minOffset;
544             spanLength = 0; // Match strings from after a string match.
545         }
546     }
547 
548     /**
549      * Spans a string and counts the smallest number of set elements on any path across the span.
550      *
551      * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans.
552      *
553      * <p>If the set does not have any fully-contained strings, then we could optimize this
554      * like span(), but such sets are likely rare, and this is at least still linear.
555      *
556      * @param s The string to be spanned
557      * @param start The start index that the span begins
558      * @param spanCondition The span condition
559      * @param outCount The count
560      * @return the limit (exclusive end) of the span
561      */
spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)562     public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition,
563             OutputInt outCount) {
564         if (spanCondition == SpanCondition.NOT_CONTAINED) {
565             return spanNot(s, start, outCount);
566         }
567         // Consider strings; they may overlap with the span,
568         // and they may result in a smaller count that with just code points.
569         if (spanCondition == SpanCondition.CONTAINED) {
570             return spanContainedAndCount(s, start, outCount);
571         }
572         // SIMPLE (not synchronized, does not use offsets)
573         int stringsLength = strings.size();
574         int length = s.length();
575         int pos = start;
576         int rest = length - start;
577         int count = 0;
578         while (rest != 0) {
579             // Try to match the next code point.
580             int cpLength = spanOne(spanSet, s, pos, rest);
581             int maxInc = (cpLength > 0) ? cpLength : 0;
582             // Try to match all of the strings.
583             for (int i = 0; i < stringsLength; ++i) {
584                 String string = strings.get(i);
585                 int length16 = string.length();
586                 if (maxInc < length16 && length16 <= rest &&
587                         matches16CPB(s, pos, length, string, length16)) {
588                     maxInc = length16;
589                 }
590             }
591             // We are done if there is no match beyond pos.
592             if (maxInc == 0) {
593                 outCount.value = count;
594                 return pos;
595             }
596             // Continue from the longest match.
597             ++count;
598             pos += maxInc;
599             rest -= maxInc;
600         }
601         outCount.value = count;
602         return pos;
603     }
604 
spanContainedAndCount(CharSequence s, int start, OutputInt outCount)605     private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) {
606         // Use offset list to try all possibilities.
607         offsets.setMaxLength(maxLength16);
608         int stringsLength = strings.size();
609         int length = s.length();
610         int pos = start;
611         int rest = length - start;
612         int count = 0;
613         while (rest != 0) {
614             // Try to match the next code point.
615             int cpLength = spanOne(spanSet, s, pos, rest);
616             if (cpLength > 0) {
617                 offsets.addOffsetAndCount(cpLength, count + 1);
618             }
619             // Try to match all of the strings.
620             for (int i = 0; i < stringsLength; ++i) {
621                 String string = strings.get(i);
622                 int length16 = string.length();
623                 // Note: If the strings were sorted by length, then we could also
624                 // avoid trying to match if there is already a match of the same length.
625                 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) &&
626                         matches16CPB(s, pos, length, string, length16)) {
627                     offsets.addOffsetAndCount(length16, count + 1);
628                 }
629             }
630             // We are done if there is no match beyond pos.
631             if (offsets.isEmpty()) {
632                 outCount.value = count;
633                 return pos;
634             }
635             // Continue from the nearest match.
636             int minOffset = offsets.popMinimum(outCount);
637             count = outCount.value;
638             pos += minOffset;
639             rest -= minOffset;
640         }
641         outCount.value = count;
642         return pos;
643     }
644 
645     /**
646      * Span a string backwards.
647      *
648      * @param s The string to be spanned
649      * @param spanCondition The span condition
650      * @return The string index which starts the span (i.e. inclusive).
651      */
spanBack(CharSequence s, int length, SpanCondition spanCondition)652     public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) {
653         if (spanCondition == SpanCondition.NOT_CONTAINED) {
654             return spanNotBack(s, length);
655         }
656         int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED);
657         if (pos == 0) {
658             return 0;
659         }
660         int spanLength = length - pos;
661 
662         // Consider strings; they may overlap with the span.
663         int initSize = 0;
664         if (spanCondition == SpanCondition.CONTAINED) {
665             // Use offset list to try all possibilities.
666             initSize = maxLength16;
667         }
668         offsets.setMaxLength(initSize);
669         int i, stringsLength = strings.size();
670         int spanBackLengthsOffset = 0;
671         if (all) {
672             spanBackLengthsOffset = stringsLength;
673         }
674         for (;;) {
675             if (spanCondition == SpanCondition.CONTAINED) {
676                 for (i = 0; i < stringsLength; ++i) {
677                     int overlap = spanLengths[spanBackLengthsOffset + i];
678                     if (overlap == ALL_CP_CONTAINED) {
679                         continue; // Irrelevant string.
680                     }
681                     String string = strings.get(i);
682 
683                     int length16 = string.length();
684 
685                     // Try to match this string at pos-(length16-overlap)..pos-length16.
686                     if (overlap >= LONG_SPAN) {
687                         overlap = length16;
688                         // While contained: No point matching fully inside the code point span.
689                         int len1 = 0;
690                         len1 = string.offsetByCodePoints(0, 1);
691                         overlap -= len1; // Length of the string minus the first code point.
692                     }
693                     if (overlap > spanLength) {
694                         overlap = spanLength;
695                     }
696                     int dec = length16 - overlap; // Keep dec+overlap==length16.
697                     for (;;) {
698                         if (dec > pos) {
699                             break;
700                         }
701                         // Try to match if the decrement is not listed already.
702                         if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) {
703                             if (dec == pos) {
704                                 return 0; // Reached the start of the string.
705                             }
706                             offsets.addOffset(dec);
707                         }
708                         if (overlap == 0) {
709                             break;
710                         }
711                         --overlap;
712                         ++dec;
713                     }
714                 }
715             } else /* SIMPLE */{
716                 int maxDec = 0, maxOverlap = 0;
717                 for (i = 0; i < stringsLength; ++i) {
718                     int overlap = spanLengths[spanBackLengthsOffset + i];
719                     // For longest match, we do need to try to match even an all-contained string
720                     // to find the match from the latest end.
721 
722                     String string = strings.get(i);
723 
724                     int length16 = string.length();
725 
726                     // Try to match this string at pos-(length16-overlap)..pos-length16.
727                     if (overlap >= LONG_SPAN) {
728                         overlap = length16;
729                         // Longest match: Need to match fully inside the code point span
730                         // to find the match from the latest end.
731                     }
732                     if (overlap > spanLength) {
733                         overlap = spanLength;
734                     }
735                     int dec = length16 - overlap; // Keep dec+overlap==length16.
736                     for (;;) {
737                         if (dec > pos || overlap < maxOverlap) {
738                             break;
739                         }
740                         // Try to match if the string is longer or ends later.
741                         if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec)
742                                 && matches16CPB(s, pos - dec, length, string, length16)) {
743                             maxDec = dec; // Longest match from latest end.
744                             maxOverlap = overlap;
745                             break;
746                         }
747                         --overlap;
748                         ++dec;
749                     }
750                 }
751 
752                 if (maxDec != 0 || maxOverlap != 0) {
753                     // Longest-match algorithm, and there was a string match.
754                     // Simply continue before it.
755                     pos -= maxDec;
756                     if (pos == 0) {
757                         return 0; // Reached the start of the string.
758                     }
759                     spanLength = 0; // Match strings from before a string match.
760                     continue;
761                 }
762             }
763             // Finished trying to match all strings at pos.
764 
765             if (spanLength != 0 || pos == length) {
766                 // The position is before an unlimited code point span (spanLength!=0),
767                 // not before a string match.
768                 // The only position where spanLength==0 before a span is pos==length.
769                 // Otherwise, an unlimited code point span is only tried again when no
770                 // strings match, and if such a non-initial span fails we stop.
771                 if (offsets.isEmpty()) {
772                     return pos; // No strings matched before a span.
773                 }
774                 // Match strings from before the next string match.
775             } else {
776                 // The position is before a string match (or a single code point).
777                 if (offsets.isEmpty()) {
778                     // No more strings matched before a previous string match.
779                     // Try another code point span from before the last string match.
780                     int oldPos = pos;
781                     pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED);
782                     spanLength = oldPos - pos;
783                     if (pos == 0 || // Reached the start of the string, or
784                             spanLength == 0 // neither strings nor span progressed.
785                     ) {
786                         return pos;
787                     }
788                     continue; // spanLength>0: Match strings from before a span.
789                 } else {
790                     // Try to match only one code point from before a string match if some
791                     // string matched beyond it, so that we try all possible positions
792                     // and don't overshoot.
793                     spanLength = spanOneBack(spanSet, s, pos);
794                     if (spanLength > 0) {
795                         if (spanLength == pos) {
796                             return 0; // Reached the start of the string.
797                         }
798                         // Match strings before this code point.
799                         // There cannot be any decrements below it because UnicodeSet strings
800                         // contain multiple code points.
801                         pos -= spanLength;
802                         offsets.shift(spanLength);
803                         spanLength = 0;
804                         continue; // Match strings from before a single code point.
805                     }
806                     // Match strings from before the next string match.
807                 }
808             }
809             pos -= offsets.popMinimum(null);
810             spanLength = 0; // Match strings from before a string match.
811         }
812     }
813 
814     /**
815      * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED)
816      *
817      * Theoretical algorithm:
818      * - Iterate through the string, and at each code point boundary:
819      *   + If the code point there is in the set, then return with the current position.
820      *   + If a set string matches at the current position, then return with the current position.
821      *
822      * Optimized implementation:
823      *
824      * (Same assumption as for span() above.)
825      *
826      * Create and cache a spanNotSet which contains
827      * all of the single code points of the original set but none of its strings.
828      * For each set string add its initial code point to the spanNotSet.
829      * (Also add its final code point for spanNotBack().)
830      *
831      * - Loop:
832      *   + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED).
833      *   + If the current code point is in the original set, then return the current position.
834      *   + If any set string matches at the current position, then return the current position.
835      *   + If there is no match at the current position, neither for the code point
836      *     there nor for any set string, then skip this code point and continue the loop.
837      *     This happens for set-string-initial code points that were added to spanNotSet
838      *     when there is not actually a match for such a set string.
839      *
840      * @param s The string to be spanned
841      * @param start The start index that the span begins
842      * @param outCount If not null: Receives the number of code points across the span.
843      * @return the limit (exclusive end) of the span
844      */
spanNot(CharSequence s, int start, OutputInt outCount)845     private int spanNot(CharSequence s, int start, OutputInt outCount) {
846         int length = s.length();
847         int pos = start, rest = length - start;
848         int stringsLength = strings.size();
849         int count = 0;
850         do {
851             // Span until we find a code point from the set,
852             // or a code point that starts or ends some string.
853             int spanLimit;
854             if (outCount == null) {
855                 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED);
856             } else {
857                 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount);
858                 outCount.value = count = count + outCount.value;
859             }
860             if (spanLimit == length) {
861                 return length; // Reached the end of the string.
862             }
863             pos = spanLimit;
864             rest = length - spanLimit;
865 
866             // Check whether the current code point is in the original set,
867             // without the string starts and ends.
868             int cpLength = spanOne(spanSet, s, pos, rest);
869             if (cpLength > 0) {
870                 return pos; // There is a set element at pos.
871             }
872 
873             // Try to match the strings at pos.
874             for (int i = 0; i < stringsLength; ++i) {
875                 if (spanLengths[i] == ALL_CP_CONTAINED) {
876                     continue; // Irrelevant string.
877                 }
878                 String string = strings.get(i);
879 
880                 int length16 = string.length();
881                 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) {
882                     return pos; // There is a set element at pos.
883                 }
884             }
885 
886             // The span(while not contained) ended on a string start/end which is
887             // not in the original set. Skip this code point and continue.
888             // cpLength<0
889             pos -= cpLength;
890             rest += cpLength;
891             ++count;
892         } while (rest != 0);
893         if (outCount != null) {
894             outCount.value = count;
895         }
896         return length; // Reached the end of the string.
897     }
898 
spanNotBack(CharSequence s, int length)899     private int spanNotBack(CharSequence s, int length) {
900         int pos = length;
901         int i, stringsLength = strings.size();
902         do {
903             // Span until we find a code point from the set,
904             // or a code point that starts or ends some string.
905             pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED);
906             if (pos == 0) {
907                 return 0; // Reached the start of the string.
908             }
909 
910             // Check whether the current code point is in the original set,
911             // without the string starts and ends.
912             int cpLength = spanOneBack(spanSet, s, pos);
913             if (cpLength > 0) {
914                 return pos; // There is a set element at pos.
915             }
916 
917             // Try to match the strings at pos.
918             for (i = 0; i < stringsLength; ++i) {
919                 // Use spanLengths rather than a spanLengths pointer because
920                 // it is easier and we only need to know whether the string is irrelevant
921                 // which is the same in either array.
922                 if (spanLengths[i] == ALL_CP_CONTAINED) {
923                     continue; // Irrelevant string.
924                 }
925                 String string = strings.get(i);
926 
927                 int length16 = string.length();
928                 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) {
929                     return pos; // There is a set element at pos.
930                 }
931             }
932 
933             // The span(while not contained) ended on a string start/end which is
934             // not in the original set. Skip this code point and continue.
935             // cpLength<0
936             pos += cpLength;
937         } while (pos != 0);
938         return 0; // Reached the start of the string.
939     }
940 
makeSpanLengthByte(int spanLength)941     static short makeSpanLengthByte(int spanLength) {
942         // 0xfe==UnicodeSetStringSpan::LONG_SPAN
943         return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN;
944     }
945 
946     // Compare strings without any argument checks. Requires length>0.
matches16(CharSequence s, int start, final String t, int length)947     private static boolean matches16(CharSequence s, int start, final String t, int length) {
948         int end = start + length;
949         while (length-- > 0) {
950             if (s.charAt(--end) != t.charAt(length)) {
951                 return false;
952             }
953         }
954         return true;
955     }
956 
957     /**
958      * Compare 16-bit Unicode strings (which may be malformed UTF-16)
959      * at code point boundaries.
960      * That is, each edge of a match must not be in the middle of a surrogate pair.
961      * @param s       The string to match in.
962      * @param start   The start index of s.
963      * @param limit   The limit of the subsequence of s being spanned.
964      * @param t       The substring to be matched in s.
965      * @param tlength The length of t.
966      */
matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)967     static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) {
968         return matches16(s, start, t, tlength)
969                 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) &&
970                         Character.isLowSurrogate(s.charAt(start)))
971                 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) &&
972                         Character.isLowSurrogate(s.charAt(start + tlength)));
973     }
974 
975     /**
976      * Does the set contain the next code point?
977      * If so, return its length; otherwise return its negative length.
978      */
spanOne(final UnicodeSet set, CharSequence s, int start, int length)979     static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) {
980         char c = s.charAt(start);
981         if (c >= 0xd800 && c <= 0xdbff && length >= 2) {
982             char c2 = s.charAt(start + 1);
983             if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) {
984                 int supplementary = Character.toCodePoint(c, c2);
985                 return set.contains(supplementary) ? 2 : -2;
986             }
987         }
988         return set.contains(c) ? 1 : -1;
989     }
990 
spanOneBack(final UnicodeSet set, CharSequence s, int length)991     static int spanOneBack(final UnicodeSet set, CharSequence s, int length) {
992         char c = s.charAt(length - 1);
993         if (c >= 0xdc00 && c <= 0xdfff && length >= 2) {
994             char c2 = s.charAt(length - 2);
995             if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) {
996                 int supplementary = Character.toCodePoint(c2, c);
997                 return set.contains(supplementary) ? 2 : -2;
998             }
999         }
1000         return set.contains(c) ? 1 : -1;
1001     }
1002 
1003     /**
1004      * Helper class for UnicodeSetStringSpan.
1005      *
1006      * <p>List of offsets from the current position from where to try matching
1007      * a code point or a string.
1008      * Stores offsets rather than indexes to simplify the code and use the same list
1009      * for both increments (in span()) and decrements (in spanBack()).
1010      *
1011      * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time
1012      * are relatively dense, that is,
1013      * there are normally no gaps of hundreds or thousands of offset values.
1014      *
1015      * <p>This class optionally also tracks the minimum non-negative count for each position,
1016      * intended to count the smallest number of elements of any path leading to that position.
1017      *
1018      * <p>The implementation uses a circular buffer of count integers,
1019      * each indicating whether the corresponding offset is in the list,
1020      * and its path element count.
1021      * This avoids inserting into a sorted list of offsets (or absolute indexes)
1022      * and physically moving part of the list.
1023      *
1024      * <p>Note: In principle, the caller should setMaxLength() to
1025      * the maximum of the max string length and U16_LENGTH/U8_LENGTH
1026      * to account for "long" single code points.
1027      *
1028      * <p>Note: An earlier version did not track counts and stored only byte flags.
1029      * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64,
1030      * the list could be stored as bit flags in a single integer.
1031      * Rather than handling a circular buffer with a start list index,
1032      * the integer would simply be shifted when lower offsets are removed.
1033      * UnicodeSet does not have a limit on the lengths of strings.
1034      */
1035     private static final class OffsetList {
1036         private int[] list;
1037         private int length;
1038         private int start;
1039 
OffsetList()1040         public OffsetList() {
1041             list = new int[16];  // default size
1042         }
1043 
setMaxLength(int maxLength)1044         public void setMaxLength(int maxLength) {
1045             if (maxLength > list.length) {
1046                 list = new int[maxLength];
1047             }
1048             clear();
1049         }
1050 
clear()1051         public void clear() {
1052             for (int i = list.length; i-- > 0;) {
1053                 list[i] = 0;
1054             }
1055             start = length = 0;
1056         }
1057 
isEmpty()1058         public boolean isEmpty() {
1059             return (length == 0);
1060         }
1061 
1062         /**
1063          * Reduces all stored offsets by delta, used when the current position moves by delta.
1064          * There must not be any offsets lower than delta.
1065          * If there is an offset equal to delta, it is removed.
1066          *
1067          * @param delta [1..maxLength]
1068          */
shift(int delta)1069         public void shift(int delta) {
1070             int i = start + delta;
1071             if (i >= list.length) {
1072                 i -= list.length;
1073             }
1074             if (list[i] != 0) {
1075                 list[i] = 0;
1076                 --length;
1077             }
1078             start = i;
1079         }
1080 
1081         /**
1082          * Adds an offset. The list must not contain it yet.
1083          * @param offset [1..maxLength]
1084          */
addOffset(int offset)1085         public void addOffset(int offset) {
1086             int i = start + offset;
1087             if (i >= list.length) {
1088                 i -= list.length;
1089             }
1090             assert list[i] == 0;
1091             list[i] = 1;
1092             ++length;
1093         }
1094 
1095         /**
1096          * Adds an offset and updates its count.
1097          * The list may already contain the offset.
1098          * @param offset [1..maxLength]
1099          */
addOffsetAndCount(int offset, int count)1100         public void addOffsetAndCount(int offset, int count) {
1101             assert count > 0;
1102             int i = start + offset;
1103             if (i >= list.length) {
1104                 i -= list.length;
1105             }
1106             if (list[i] == 0) {
1107                 list[i] = count;
1108                 ++length;
1109             } else if (count < list[i]) {
1110                 list[i] = count;
1111             }
1112         }
1113 
1114         /**
1115          * @param offset [1..maxLength]
1116          */
containsOffset(int offset)1117         public boolean containsOffset(int offset) {
1118             int i = start + offset;
1119             if (i >= list.length) {
1120                 i -= list.length;
1121             }
1122             return list[i] != 0;
1123         }
1124 
1125         /**
1126          * @param offset [1..maxLength]
1127          */
hasCountAtOffset(int offset, int count)1128         public boolean hasCountAtOffset(int offset, int count) {
1129             int i = start + offset;
1130             if (i >= list.length) {
1131                 i -= list.length;
1132             }
1133             int oldCount = list[i];
1134             return oldCount != 0 && oldCount <= count;
1135         }
1136 
1137         /**
1138          * Finds the lowest stored offset from a non-empty list, removes it,
1139          * and reduces all other offsets by this minimum.
1140          * @return min=[1..maxLength]
1141          */
popMinimum(OutputInt outCount)1142         public int popMinimum(OutputInt outCount) {
1143             // Look for the next offset in list[start+1..list.length-1].
1144             int i = start, result;
1145             while (++i < list.length) {
1146                 int count = list[i];
1147                 if (count != 0) {
1148                     list[i] = 0;
1149                     --length;
1150                     result = i - start;
1151                     start = i;
1152                     if (outCount != null) { outCount.value = count; }
1153                     return result;
1154                 }
1155             }
1156             // i==list.length
1157 
1158             // Wrap around and look for the next offset in list[0..start].
1159             // Since the list is not empty, there will be one.
1160             result = list.length - start;
1161             i = 0;
1162             int count;
1163             while ((count = list[i]) == 0) {
1164                 ++i;
1165             }
1166             list[i] = 0;
1167             --length;
1168             start = i;
1169             if (outCount != null) { outCount.value = count; }
1170             return result + i;
1171         }
1172     }
1173 }
1174