• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 *
4 *   Copyright (C) 2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 ******************************************************************************
8 *   file name:  unisetspan.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2007mar01
14 *   created by: Markus W. Scherer
15 */
16 
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uvector.h"
22 #include "unisetspan.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 /*
27  * List of offsets from the current position from where to try matching
28  * a code point or a string.
29  * Store offsets rather than indexes to simplify the code and use the same list
30  * for both increments (in span()) and decrements (in spanBack()).
31  *
32  * Assumption: The maximum offset is limited, and the offsets that are stored
33  * at any one time are relatively dense, that is, there are normally no gaps of
34  * hundreds or thousands of offset values.
35  *
36  * The implementation uses a circular buffer of byte flags,
37  * each indicating whether the corresponding offset is in the list.
38  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39  * physically moving part of the list.
40  *
41  * Note: In principle, the caller should setMaxLength() to the maximum of the
42  * max string length and U16_LENGTH/U8_LENGTH to account for
43  * "long" single code points.
44  * However, this implementation uses at least a staticList with more than
45  * U8_LENGTH entries anyway.
46  *
47  * Note: If maxLength were guaranteed to be no more than 32 or 64,
48  * the list could be stored as bit flags in a single integer.
49  * Rather than handling a circular buffer with a start list index,
50  * the integer would simply be shifted when lower offsets are removed.
51  * UnicodeSet does not have a limit on the lengths of strings.
52  */
53 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
54 public:
OffsetList()55     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
56 
~OffsetList()57     ~OffsetList() {
58         if(list!=staticList) {
59             uprv_free(list);
60         }
61     }
62 
63     // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)64     void setMaxLength(int32_t maxLength) {
65         if(maxLength<=(int32_t)sizeof(staticList)) {
66             capacity=(int32_t)sizeof(staticList);
67         } else {
68             UBool *l=(UBool *)uprv_malloc(maxLength);
69             if(l!=NULL) {
70                 list=l;
71                 capacity=maxLength;
72             }
73         }
74         uprv_memset(list, 0, capacity);
75     }
76 
clear()77     void clear() {
78         uprv_memset(list, 0, capacity);
79         start=length=0;
80     }
81 
isEmpty() const82     UBool isEmpty() const {
83         return (UBool)(length==0);
84     }
85 
86     // Reduce all stored offsets by delta, used when the current position
87     // moves by delta.
88     // There must not be any offsets lower than delta.
89     // If there is an offset equal to delta, it is removed.
90     // delta=[1..maxLength]
shift(int32_t delta)91     void shift(int32_t delta) {
92         int32_t i=start+delta;
93         if(i>=capacity) {
94             i-=capacity;
95         }
96         if(list[i]) {
97             list[i]=FALSE;
98             --length;
99         }
100         start=i;
101     }
102 
103     // Add an offset. The list must not contain it yet.
104     // offset=[1..maxLength]
addOffset(int32_t offset)105     void addOffset(int32_t offset) {
106         int32_t i=start+offset;
107         if(i>=capacity) {
108             i-=capacity;
109         }
110         list[i]=TRUE;
111         ++length;
112     }
113 
114     // offset=[1..maxLength]
containsOffset(int32_t offset) const115     UBool containsOffset(int32_t offset) const {
116         int32_t i=start+offset;
117         if(i>=capacity) {
118             i-=capacity;
119         }
120         return list[i];
121     }
122 
123     // Find the lowest stored offset from a non-empty list, remove it,
124     // and reduce all other offsets by this minimum.
125     // Returns [1..maxLength].
popMinimum()126     int32_t popMinimum() {
127         // Look for the next offset in list[start+1..capacity-1].
128         int32_t i=start, result;
129         while(++i<capacity) {
130             if(list[i]) {
131                 list[i]=FALSE;
132                 --length;
133                 result=i-start;
134                 start=i;
135                 return result;
136             }
137         }
138         // i==capacity
139 
140         // Wrap around and look for the next offset in list[0..start].
141         // Since the list is not empty, there will be one.
142         result=capacity-start;
143         i=0;
144         while(!list[i]) {
145             ++i;
146         }
147         list[i]=FALSE;
148         --length;
149         start=i;
150         return result+=i;
151     }
152 
153 private:
154     UBool *list;
155     int32_t capacity;
156     int32_t length;
157     int32_t start;
158 
159     UBool staticList[16];
160 };
161 
162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
163 static int32_t
getUTF8Length(const UChar * s,int32_t length)164 getUTF8Length(const UChar *s, int32_t length) {
165     UErrorCode errorCode=U_ZERO_ERROR;
166     int32_t length8=0;
167     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
168     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
169         return length8;
170     } else {
171         // The string contains an unpaired surrogate.
172         // Ignore this string.
173         return 0;
174     }
175 }
176 
177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
178 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
180     UErrorCode errorCode=U_ZERO_ERROR;
181     int32_t length8=0;
182     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
183     if(U_SUCCESS(errorCode)) {
184         return length8;
185     } else {
186         // The string contains an unpaired surrogate.
187         // Ignore this string.
188         return 0;
189     }
190 }
191 
192 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)193 makeSpanLengthByte(int32_t spanLength) {
194     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
196 }
197 
198 // Construct for all variants of span(), or only for any one variant.
199 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
201                                            const UVector &setStrings,
202                                            uint32_t which)
203         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
204           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
205           utf8Length(0),
206           maxLength16(0), maxLength8(0),
207           all((UBool)(which==ALL)) {
208     spanSet.retainAll(set);
209     if(which&NOT_CONTAINED) {
210         // Default to the same sets.
211         // addToSpanNotSet() will create a separate set if necessary.
212         pSpanNotSet=&spanSet;
213     }
214 
215     // Determine if the strings even need to be taken into account at all for span() etc.
216     // If any string is relevant, then all strings need to be used for
217     // span(longest match) but only the relevant ones for span(while contained).
218     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222     int32_t stringsLength=strings.size();
223 
224     int32_t i, spanLength;
225     UBool someRelevant=FALSE;
226     for(i=0; i<stringsLength; ++i) {
227         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
228         const UChar *s16=string.getBuffer();
229         int32_t length16=string.length();
230         UBool thisRelevant;
231         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
232         if(spanLength<length16) {  // Relevant string.
233             someRelevant=thisRelevant=TRUE;
234         } else {
235             thisRelevant=FALSE;
236         }
237         if((which&UTF16) && length16>maxLength16) {
238             maxLength16=length16;
239         }
240         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
241             int32_t length8=getUTF8Length(s16, length16);
242             utf8Length+=length8;
243             if(length8>maxLength8) {
244                 maxLength8=length8;
245             }
246         }
247     }
248     if(!someRelevant) {
249         maxLength16=maxLength8=0;
250         return;
251     }
252 
253     // Freeze after checking for the need to use strings at all because freezing
254     // a set takes some time and memory which are wasted if there are no relevant strings.
255     if(all) {
256         spanSet.freeze();
257     }
258 
259     uint8_t *spanBackLengths;
260     uint8_t *spanUTF8Lengths;
261     uint8_t *spanBackUTF8Lengths;
262 
263     // Allocate a block of meta data.
264     int32_t allocSize;
265     if(all) {
266         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
268     } else {
269         allocSize=stringsLength;  // One set of span lengths.
270         if(which&UTF8) {
271             // UTF-8 lengths and UTF-8 strings.
272             allocSize+=stringsLength*4+utf8Length;
273         }
274     }
275     if(allocSize<=(int32_t)sizeof(staticLengths)) {
276         utf8Lengths=staticLengths;
277     } else {
278         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
279         if(utf8Lengths==NULL) {
280             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281             return;  // Out of memory.
282         }
283     }
284 
285     if(all) {
286         // Store span lengths for all span() variants.
287         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
288         spanBackLengths=spanLengths+stringsLength;
289         spanUTF8Lengths=spanBackLengths+stringsLength;
290         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
291         utf8=spanBackUTF8Lengths+stringsLength;
292     } else {
293         // Store span lengths for only one span() variant.
294         if(which&UTF8) {
295             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
296             utf8=spanLengths+stringsLength;
297         } else {
298             spanLengths=(uint8_t *)utf8Lengths;
299         }
300         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
301     }
302 
303     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
305 
306     for(i=0; i<stringsLength; ++i) {
307         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
308         const UChar *s16=string.getBuffer();
309         int32_t length16=string.length();
310         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
311         if(spanLength<length16) {  // Relevant string.
312             if(which&UTF16) {
313                 if(which&CONTAINED) {
314                     if(which&FWD) {
315                         spanLengths[i]=makeSpanLengthByte(spanLength);
316                     }
317                     if(which&BACK) {
318                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
319                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
320                     }
321                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
323                 }
324             }
325             if(which&UTF8) {
326                 uint8_t *s8=utf8+utf8Count;
327                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
328                 utf8Count+=utf8Lengths[i]=length8;
329                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
330                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
331                 } else {  // Relevant for UTF-8.
332                     if(which&CONTAINED) {
333                         if(which&FWD) {
334                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
335                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
336                         }
337                         if(which&BACK) {
338                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340                         }
341                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
343                     }
344                 }
345             }
346             if(which&NOT_CONTAINED) {
347                 // Add string start and end code points to the spanNotSet so that
348                 // a span(while not contained) stops before any string.
349                 UChar32 c;
350                 if(which&FWD) {
351                     int32_t len=0;
352                     U16_NEXT(s16, len, length16, c);
353                     addToSpanNotSet(c);
354                 }
355                 if(which&BACK) {
356                     int32_t len=length16;
357                     U16_PREV(s16, 0, len, c);
358                     addToSpanNotSet(c);
359                 }
360             }
361         } else {  // Irrelevant string.
362             if(which&UTF8) {
363                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
364                     uint8_t *s8=utf8+utf8Count;
365                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
366                     utf8Count+=utf8Lengths[i]=length8;
367                 } else {
368                     utf8Lengths[i]=0;
369                 }
370             }
371             if(all) {
372                 spanLengths[i]=spanBackLengths[i]=
373                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
374                         (uint8_t)ALL_CP_CONTAINED;
375             } else {
376                 // All spanXYZLengths pointers contain the same address.
377                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
378             }
379         }
380     }
381 
382     // Finish.
383     if(all) {
384         pSpanNotSet->freeze();
385     }
386 }
387 
388 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
390                                            const UVector &newParentSetStrings)
391         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
392           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
393           utf8Length(otherStringSpan.utf8Length),
394           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
395           all(TRUE) {
396     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
397         pSpanNotSet=&spanSet;
398     } else {
399         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
400     }
401 
402     // Allocate a block of meta data.
403     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404     int32_t stringsLength=strings.size();
405     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
406     if(allocSize<=(int32_t)sizeof(staticLengths)) {
407         utf8Lengths=staticLengths;
408     } else {
409         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
410         if(utf8Lengths==NULL) {
411             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412             return;  // Out of memory.
413         }
414     }
415 
416     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
417     utf8=spanLengths+stringsLength*4;
418     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
419 }
420 
~UnicodeSetStringSpan()421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
422     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
423         delete pSpanNotSet;
424     }
425     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
426         uprv_free(utf8Lengths);
427     }
428 }
429 
addToSpanNotSet(UChar32 c)430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
431     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
432         if(spanSet.contains(c)) {
433             return;  // Nothing to do.
434         }
435         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
436         if(newSet==NULL) {
437             return;  // Out of memory.
438         } else {
439             pSpanNotSet=newSet;
440         }
441     }
442     pSpanNotSet->add(c);
443 }
444 
445 // Compare strings without any argument checks. Requires length>0.
446 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)447 matches16(const UChar *s, const UChar *t, int32_t length) {
448     do {
449         if(*s++!=*t++) {
450             return FALSE;
451         }
452     } while(--length>0);
453     return TRUE;
454 }
455 
456 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
458     do {
459         if(*s++!=*t++) {
460             return FALSE;
461         }
462     } while(--length>0);
463     return TRUE;
464 }
465 
466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
467 // at code point boundaries.
468 // That is, each edge of a match must not be in the middle of a surrogate pair.
469 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
471     s+=start;
472     limit-=start;
473     return matches16(s, t, length) &&
474            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
475            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
476 }
477 
478 // Does the set contain the next code point?
479 // If so, return its length; otherwise return its negative length.
480 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
482     UChar c=*s, c2;
483     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
484         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
485     }
486     return set.contains(c) ? 1 : -1;
487 }
488 
489 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
491     UChar c=s[length-1], c2;
492     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
493         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
494     }
495     return set.contains(c) ? 1 : -1;
496 }
497 
498 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
500     UChar32 c=*s;
501     if((int8_t)c>=0) {
502         return set.contains(c) ? 1 : -1;
503     }
504     // Take advantage of non-ASCII fastpaths in U8_NEXT().
505     int32_t i=0;
506     U8_NEXT(s, i, length, c);
507     return set.contains(c) ? i : -i;
508 }
509 
510 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
512     UChar32 c=s[length-1];
513     if((int8_t)c>=0) {
514         return set.contains(c) ? 1 : -1;
515     }
516     int32_t i=length-1;
517     c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
518     length-=i;
519     return set.contains(c) ? length : -length;
520 }
521 
522 /*
523  * Note: In span() when spanLength==0 (after a string match, or at the beginning
524  * after an empty code point span) and in spanNot() and spanNotUTF8(),
525  * string matching could use a binary search
526  * because all string matches are done from the same start index.
527  *
528  * For UTF-8, this would require a comparison function that returns UTF-16 order.
529  *
530  * This optimization should not be necessary for normal UnicodeSets because
531  * most sets have no strings, and most sets with strings have
532  * very few very short strings.
533  * For cases with many strings, it might be better to use a different API
534  * and implementation with a DFA (state machine).
535  */
536 
537 /*
538  * Algorithm for span(USET_SPAN_CONTAINED)
539  *
540  * Theoretical algorithm:
541  * - Iterate through the string, and at each code point boundary:
542  *   + If the code point there is in the set, then remember to continue after it.
543  *   + If a set string matches at the current position, then remember to continue after it.
544  *   + Either recursively span for each code point or string match,
545  *     or recursively span for all but the shortest one and
546  *     iteratively continue the span with the shortest local match.
547  *   + Remember the longest recursive span (the farthest end point).
548  *   + If there is no match at the current position, neither for the code point there
549  *     nor for any set string, then stop and return the longest recursive span length.
550  *
551  * Optimized implementation:
552  *
553  * (We assume that most sets will have very few very short strings.
554  * A span using a string-less set is extremely fast.)
555  *
556  * Create and cache a spanSet which contains all of the single code points
557  * of the original set but none of its strings.
558  *
559  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
560  * - Loop:
561  *   + Try to match each set string at the end of the spanLength.
562  *     ~ Set strings that start with set-contained code points must be matched
563  *       with a partial overlap because the recursive algorithm would have tried
564  *       to match them at every position.
565  *     ~ Set strings that entirely consist of set-contained code points
566  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
567  *       recursive algorithm would continue after them anyway
568  *       and find the longest recursive match from their end.
569  *     ~ Rather than recursing, note each end point of a set string match.
570  *   + If no set string matched after spanSet.span(), then return
571  *     with where the spanSet.span() ended.
572  *   + If at least one set string matched after spanSet.span(), then
573  *     pop the shortest string match end point and continue
574  *     the loop, trying to match all set strings from there.
575  *   + If at least one more set string matched after a previous string match,
576  *     then test if the code point after the previous string match is also
577  *     contained in the set.
578  *     Continue the loop with the shortest end point of either this code point
579  *     or a matching set string.
580  *   + If no more set string matched after a previous string match,
581  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582  *     Stop if spanLength==0, otherwise continue the loop.
583  *
584  * By noting each end point of a set string match,
585  * the function visits each string position at most once and finishes
586  * in linear time.
587  *
588  * The recursive algorithm may visit the same string position many times
589  * if multiple paths lead to it and finishes in exponential time.
590  */
591 
592 /*
593  * Algorithm for span(USET_SPAN_SIMPLE)
594  *
595  * Theoretical algorithm:
596  * - Iterate through the string, and at each code point boundary:
597  *   + If the code point there is in the set, then remember to continue after it.
598  *   + If a set string matches at the current position, then remember to continue after it.
599  *   + Continue from the farthest match position and ignore all others.
600  *   + If there is no match at the current position,
601  *     then stop and return the current position.
602  *
603  * Optimized implementation:
604  *
605  * (Same assumption and spanSet as above.)
606  *
607  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
608  * - Loop:
609  *   + Try to match each set string at the end of the spanLength.
610  *     ~ Set strings that start with set-contained code points must be matched
611  *       with a partial overlap because the standard algorithm would have tried
612  *       to match them earlier.
613  *     ~ Set strings that entirely consist of set-contained code points
614  *       must be matched with a full overlap because the longest-match algorithm
615  *       would hide set string matches that end earlier.
616  *       Such set strings need not be matched earlier inside the code point span
617  *       because the standard algorithm would then have continued after
618  *       the set string match anyway.
619  *     ~ Remember the longest set string match (farthest end point) from the earliest
620  *       starting point.
621  *   + If no set string matched after spanSet.span(), then return
622  *     with where the spanSet.span() ended.
623  *   + If at least one set string matched, then continue the loop after the
624  *     longest match from the earliest position.
625  *   + If no more set string matched after a previous string match,
626  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627  *     Stop if spanLength==0, otherwise continue the loop.
628  */
629 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
631     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
632         return spanNot(s, length);
633     }
634     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
635     if(spanLength==length) {
636         return length;
637     }
638 
639     // Consider strings; they may overlap with the span.
640     OffsetList offsets;
641     if(spanCondition==USET_SPAN_CONTAINED) {
642         // Use offset list to try all possibilities.
643         offsets.setMaxLength(maxLength16);
644     }
645     int32_t pos=spanLength, rest=length-pos;
646     int32_t i, stringsLength=strings.size();
647     for(;;) {
648         if(spanCondition==USET_SPAN_CONTAINED) {
649             for(i=0; i<stringsLength; ++i) {
650                 int32_t overlap=spanLengths[i];
651                 if(overlap==ALL_CP_CONTAINED) {
652                     continue;  // Irrelevant string.
653                 }
654                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
655                 const UChar *s16=string.getBuffer();
656                 int32_t length16=string.length();
657 
658                 // Try to match this string at pos-overlap..pos.
659                 if(overlap>=LONG_SPAN) {
660                     overlap=length16;
661                     // While contained: No point matching fully inside the code point span.
662                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
663                 }
664                 if(overlap>spanLength) {
665                     overlap=spanLength;
666                 }
667                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
668                 for(;;) {
669                     if(inc>rest) {
670                         break;
671                     }
672                     // Try to match if the increment is not listed already.
673                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
674                         if(inc==rest) {
675                             return length;  // Reached the end of the string.
676                         }
677                         offsets.addOffset(inc);
678                     }
679                     if(overlap==0) {
680                         break;
681                     }
682                     --overlap;
683                     ++inc;
684                 }
685             }
686         } else /* USET_SPAN_SIMPLE */ {
687             int32_t maxInc=0, maxOverlap=0;
688             for(i=0; i<stringsLength; ++i) {
689                 int32_t overlap=spanLengths[i];
690                 // For longest match, we do need to try to match even an all-contained string
691                 // to find the match from the earliest start.
692 
693                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
694                 const UChar *s16=string.getBuffer();
695                 int32_t length16=string.length();
696 
697                 // Try to match this string at pos-overlap..pos.
698                 if(overlap>=LONG_SPAN) {
699                     overlap=length16;
700                     // Longest match: Need to match fully inside the code point span
701                     // to find the match from the earliest start.
702                 }
703                 if(overlap>spanLength) {
704                     overlap=spanLength;
705                 }
706                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
707                 for(;;) {
708                     if(inc>rest || overlap<maxOverlap) {
709                         break;
710                     }
711                     // Try to match if the string is longer or starts earlier.
712                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
713                         matches16CPB(s, pos-overlap, length, s16, length16)
714                     ) {
715                         maxInc=inc;  // Longest match from earliest start.
716                         maxOverlap=overlap;
717                         break;
718                     }
719                     --overlap;
720                     ++inc;
721                 }
722             }
723 
724             if(maxInc!=0 || maxOverlap!=0) {
725                 // Longest-match algorithm, and there was a string match.
726                 // Simply continue after it.
727                 pos+=maxInc;
728                 rest-=maxInc;
729                 if(rest==0) {
730                     return length;  // Reached the end of the string.
731                 }
732                 spanLength=0;  // Match strings from after a string match.
733                 continue;
734             }
735         }
736         // Finished trying to match all strings at pos.
737 
738         if(spanLength!=0 || pos==0) {
739             // The position is after an unlimited code point span (spanLength!=0),
740             // not after a string match.
741             // The only position where spanLength==0 after a span is pos==0.
742             // Otherwise, an unlimited code point span is only tried again when no
743             // strings match, and if such a non-initial span fails we stop.
744             if(offsets.isEmpty()) {
745                 return pos;  // No strings matched after a span.
746             }
747             // Match strings from after the next string match.
748         } else {
749             // The position is after a string match (or a single code point).
750             if(offsets.isEmpty()) {
751                 // No more strings matched after a previous string match.
752                 // Try another code point span from after the last string match.
753                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
754                 if( spanLength==rest || // Reached the end of the string, or
755                     spanLength==0       // neither strings nor span progressed.
756                 ) {
757                     return pos+spanLength;
758                 }
759                 pos+=spanLength;
760                 rest-=spanLength;
761                 continue;  // spanLength>0: Match strings from after a span.
762             } else {
763                 // Try to match only one code point from after a string match if some
764                 // string matched beyond it, so that we try all possible positions
765                 // and don't overshoot.
766                 spanLength=spanOne(spanSet, s+pos, rest);
767                 if(spanLength>0) {
768                     if(spanLength==rest) {
769                         return length;  // Reached the end of the string.
770                     }
771                     // Match strings after this code point.
772                     // There cannot be any increments below it because UnicodeSet strings
773                     // contain multiple code points.
774                     pos+=spanLength;
775                     rest-=spanLength;
776                     offsets.shift(spanLength);
777                     spanLength=0;
778                     continue;  // Match strings from after a single code point.
779                 }
780                 // Match strings from after the next string match.
781             }
782         }
783         int32_t minOffset=offsets.popMinimum();
784         pos+=minOffset;
785         rest-=minOffset;
786         spanLength=0;  // Match strings from after a string match.
787     }
788 }
789 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
791     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
792         return spanNotBack(s, length);
793     }
794     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
795     if(pos==0) {
796         return 0;
797     }
798     int32_t spanLength=length-pos;
799 
800     // Consider strings; they may overlap with the span.
801     OffsetList offsets;
802     if(spanCondition==USET_SPAN_CONTAINED) {
803         // Use offset list to try all possibilities.
804         offsets.setMaxLength(maxLength16);
805     }
806     int32_t i, stringsLength=strings.size();
807     uint8_t *spanBackLengths=spanLengths;
808     if(all) {
809         spanBackLengths+=stringsLength;
810     }
811     for(;;) {
812         if(spanCondition==USET_SPAN_CONTAINED) {
813             for(i=0; i<stringsLength; ++i) {
814                 int32_t overlap=spanBackLengths[i];
815                 if(overlap==ALL_CP_CONTAINED) {
816                     continue;  // Irrelevant string.
817                 }
818                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
819                 const UChar *s16=string.getBuffer();
820                 int32_t length16=string.length();
821 
822                 // Try to match this string at pos-(length16-overlap)..pos-length16.
823                 if(overlap>=LONG_SPAN) {
824                     overlap=length16;
825                     // While contained: No point matching fully inside the code point span.
826                     int32_t len1=0;
827                     U16_FWD_1(s16, len1, overlap);
828                     overlap-=len1;  // Length of the string minus the first code point.
829                 }
830                 if(overlap>spanLength) {
831                     overlap=spanLength;
832                 }
833                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
834                 for(;;) {
835                     if(dec>pos) {
836                         break;
837                     }
838                     // Try to match if the decrement is not listed already.
839                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
840                         if(dec==pos) {
841                             return 0;  // Reached the start of the string.
842                         }
843                         offsets.addOffset(dec);
844                     }
845                     if(overlap==0) {
846                         break;
847                     }
848                     --overlap;
849                     ++dec;
850                 }
851             }
852         } else /* USET_SPAN_SIMPLE */ {
853             int32_t maxDec=0, maxOverlap=0;
854             for(i=0; i<stringsLength; ++i) {
855                 int32_t overlap=spanBackLengths[i];
856                 // For longest match, we do need to try to match even an all-contained string
857                 // to find the match from the latest end.
858 
859                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
860                 const UChar *s16=string.getBuffer();
861                 int32_t length16=string.length();
862 
863                 // Try to match this string at pos-(length16-overlap)..pos-length16.
864                 if(overlap>=LONG_SPAN) {
865                     overlap=length16;
866                     // Longest match: Need to match fully inside the code point span
867                     // to find the match from the latest end.
868                 }
869                 if(overlap>spanLength) {
870                     overlap=spanLength;
871                 }
872                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
873                 for(;;) {
874                     if(dec>pos || overlap<maxOverlap) {
875                         break;
876                     }
877                     // Try to match if the string is longer or ends later.
878                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
879                         matches16CPB(s, pos-dec, length, s16, length16)
880                     ) {
881                         maxDec=dec;  // Longest match from latest end.
882                         maxOverlap=overlap;
883                         break;
884                     }
885                     --overlap;
886                     ++dec;
887                 }
888             }
889 
890             if(maxDec!=0 || maxOverlap!=0) {
891                 // Longest-match algorithm, and there was a string match.
892                 // Simply continue before it.
893                 pos-=maxDec;
894                 if(pos==0) {
895                     return 0;  // Reached the start of the string.
896                 }
897                 spanLength=0;  // Match strings from before a string match.
898                 continue;
899             }
900         }
901         // Finished trying to match all strings at pos.
902 
903         if(spanLength!=0 || pos==length) {
904             // The position is before an unlimited code point span (spanLength!=0),
905             // not before a string match.
906             // The only position where spanLength==0 before a span is pos==length.
907             // Otherwise, an unlimited code point span is only tried again when no
908             // strings match, and if such a non-initial span fails we stop.
909             if(offsets.isEmpty()) {
910                 return pos;  // No strings matched before a span.
911             }
912             // Match strings from before the next string match.
913         } else {
914             // The position is before a string match (or a single code point).
915             if(offsets.isEmpty()) {
916                 // No more strings matched before a previous string match.
917                 // Try another code point span from before the last string match.
918                 int32_t oldPos=pos;
919                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
920                 spanLength=oldPos-pos;
921                 if( pos==0 ||           // Reached the start of the string, or
922                     spanLength==0       // neither strings nor span progressed.
923                 ) {
924                     return pos;
925                 }
926                 continue;  // spanLength>0: Match strings from before a span.
927             } else {
928                 // Try to match only one code point from before a string match if some
929                 // string matched beyond it, so that we try all possible positions
930                 // and don't overshoot.
931                 spanLength=spanOneBack(spanSet, s, pos);
932                 if(spanLength>0) {
933                     if(spanLength==pos) {
934                         return 0;  // Reached the start of the string.
935                     }
936                     // Match strings before this code point.
937                     // There cannot be any decrements below it because UnicodeSet strings
938                     // contain multiple code points.
939                     pos-=spanLength;
940                     offsets.shift(spanLength);
941                     spanLength=0;
942                     continue;  // Match strings from before a single code point.
943                 }
944                 // Match strings from before the next string match.
945             }
946         }
947         pos-=offsets.popMinimum();
948         spanLength=0;  // Match strings from before a string match.
949     }
950 }
951 
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
953     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
954         return spanNotUTF8(s, length);
955     }
956     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
957     if(spanLength==length) {
958         return length;
959     }
960 
961     // Consider strings; they may overlap with the span.
962     OffsetList offsets;
963     if(spanCondition==USET_SPAN_CONTAINED) {
964         // Use offset list to try all possibilities.
965         offsets.setMaxLength(maxLength8);
966     }
967     int32_t pos=spanLength, rest=length-pos;
968     int32_t i, stringsLength=strings.size();
969     uint8_t *spanUTF8Lengths=spanLengths;
970     if(all) {
971         spanUTF8Lengths+=2*stringsLength;
972     }
973     for(;;) {
974         const uint8_t *s8=utf8;
975         int32_t length8;
976         if(spanCondition==USET_SPAN_CONTAINED) {
977             for(i=0; i<stringsLength; ++i) {
978                 length8=utf8Lengths[i];
979                 if(length8==0) {
980                     continue;  // String not representable in UTF-8.
981                 }
982                 int32_t overlap=spanUTF8Lengths[i];
983                 if(overlap==ALL_CP_CONTAINED) {
984                     s8+=length8;
985                     continue;  // Irrelevant string.
986                 }
987 
988                 // Try to match this string at pos-overlap..pos.
989                 if(overlap>=LONG_SPAN) {
990                     overlap=length8;
991                     // While contained: No point matching fully inside the code point span.
992                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
993                 }
994                 if(overlap>spanLength) {
995                     overlap=spanLength;
996                 }
997                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
998                 for(;;) {
999                     if(inc>rest) {
1000                         break;
1001                     }
1002                     // Try to match if the increment is not listed already.
1003                     // Match at code point boundaries. (The UTF-8 strings were converted
1004                     // from UTF-16 and are guaranteed to be well-formed.)
1005                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006                         !offsets.containsOffset(inc) &&
1007                         matches8(s+pos-overlap, s8, length8)
1008 
1009                     ) {
1010                         if(inc==rest) {
1011                             return length;  // Reached the end of the string.
1012                         }
1013                         offsets.addOffset(inc);
1014                     }
1015                     if(overlap==0) {
1016                         break;
1017                     }
1018                     --overlap;
1019                     ++inc;
1020                 }
1021                 s8+=length8;
1022             }
1023         } else /* USET_SPAN_SIMPLE */ {
1024             int32_t maxInc=0, maxOverlap=0;
1025             for(i=0; i<stringsLength; ++i) {
1026                 length8=utf8Lengths[i];
1027                 if(length8==0) {
1028                     continue;  // String not representable in UTF-8.
1029                 }
1030                 int32_t overlap=spanUTF8Lengths[i];
1031                 // For longest match, we do need to try to match even an all-contained string
1032                 // to find the match from the earliest start.
1033 
1034                 // Try to match this string at pos-overlap..pos.
1035                 if(overlap>=LONG_SPAN) {
1036                     overlap=length8;
1037                     // Longest match: Need to match fully inside the code point span
1038                     // to find the match from the earliest start.
1039                 }
1040                 if(overlap>spanLength) {
1041                     overlap=spanLength;
1042                 }
1043                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1044                 for(;;) {
1045                     if(inc>rest || overlap<maxOverlap) {
1046                         break;
1047                     }
1048                     // Try to match if the string is longer or starts earlier.
1049                     // Match at code point boundaries. (The UTF-8 strings were converted
1050                     // from UTF-16 and are guaranteed to be well-formed.)
1051                     if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053                         matches8(s+pos-overlap, s8, length8)
1054 
1055                     ) {
1056                         maxInc=inc;  // Longest match from earliest start.
1057                         maxOverlap=overlap;
1058                         break;
1059                     }
1060                     --overlap;
1061                     ++inc;
1062                 }
1063                 s8+=length8;
1064             }
1065 
1066             if(maxInc!=0 || maxOverlap!=0) {
1067                 // Longest-match algorithm, and there was a string match.
1068                 // Simply continue after it.
1069                 pos+=maxInc;
1070                 rest-=maxInc;
1071                 if(rest==0) {
1072                     return length;  // Reached the end of the string.
1073                 }
1074                 spanLength=0;  // Match strings from after a string match.
1075                 continue;
1076             }
1077         }
1078         // Finished trying to match all strings at pos.
1079 
1080         if(spanLength!=0 || pos==0) {
1081             // The position is after an unlimited code point span (spanLength!=0),
1082             // not after a string match.
1083             // The only position where spanLength==0 after a span is pos==0.
1084             // Otherwise, an unlimited code point span is only tried again when no
1085             // strings match, and if such a non-initial span fails we stop.
1086             if(offsets.isEmpty()) {
1087                 return pos;  // No strings matched after a span.
1088             }
1089             // Match strings from after the next string match.
1090         } else {
1091             // The position is after a string match (or a single code point).
1092             if(offsets.isEmpty()) {
1093                 // No more strings matched after a previous string match.
1094                 // Try another code point span from after the last string match.
1095                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096                 if( spanLength==rest || // Reached the end of the string, or
1097                     spanLength==0       // neither strings nor span progressed.
1098                 ) {
1099                     return pos+spanLength;
1100                 }
1101                 pos+=spanLength;
1102                 rest-=spanLength;
1103                 continue;  // spanLength>0: Match strings from after a span.
1104             } else {
1105                 // Try to match only one code point from after a string match if some
1106                 // string matched beyond it, so that we try all possible positions
1107                 // and don't overshoot.
1108                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109                 if(spanLength>0) {
1110                     if(spanLength==rest) {
1111                         return length;  // Reached the end of the string.
1112                     }
1113                     // Match strings after this code point.
1114                     // There cannot be any increments below it because UnicodeSet strings
1115                     // contain multiple code points.
1116                     pos+=spanLength;
1117                     rest-=spanLength;
1118                     offsets.shift(spanLength);
1119                     spanLength=0;
1120                     continue;  // Match strings from after a single code point.
1121                 }
1122                 // Match strings from after the next string match.
1123             }
1124         }
1125         int32_t minOffset=offsets.popMinimum();
1126         pos+=minOffset;
1127         rest-=minOffset;
1128         spanLength=0;  // Match strings from after a string match.
1129     }
1130 }
1131 
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134         return spanNotBackUTF8(s, length);
1135     }
1136     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137     if(pos==0) {
1138         return 0;
1139     }
1140     int32_t spanLength=length-pos;
1141 
1142     // Consider strings; they may overlap with the span.
1143     OffsetList offsets;
1144     if(spanCondition==USET_SPAN_CONTAINED) {
1145         // Use offset list to try all possibilities.
1146         offsets.setMaxLength(maxLength8);
1147     }
1148     int32_t i, stringsLength=strings.size();
1149     uint8_t *spanBackUTF8Lengths=spanLengths;
1150     if(all) {
1151         spanBackUTF8Lengths+=3*stringsLength;
1152     }
1153     for(;;) {
1154         const uint8_t *s8=utf8;
1155         int32_t length8;
1156         if(spanCondition==USET_SPAN_CONTAINED) {
1157             for(i=0; i<stringsLength; ++i) {
1158                 length8=utf8Lengths[i];
1159                 if(length8==0) {
1160                     continue;  // String not representable in UTF-8.
1161                 }
1162                 int32_t overlap=spanBackUTF8Lengths[i];
1163                 if(overlap==ALL_CP_CONTAINED) {
1164                     s8+=length8;
1165                     continue;  // Irrelevant string.
1166                 }
1167 
1168                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1169                 if(overlap>=LONG_SPAN) {
1170                     overlap=length8;
1171                     // While contained: No point matching fully inside the code point span.
1172                     int32_t len1=0;
1173                     U8_FWD_1(s8, len1, overlap);
1174                     overlap-=len1;  // Length of the string minus the first code point.
1175                 }
1176                 if(overlap>spanLength) {
1177                     overlap=spanLength;
1178                 }
1179                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1180                 for(;;) {
1181                     if(dec>pos) {
1182                         break;
1183                     }
1184                     // Try to match if the decrement is not listed already.
1185                     // Match at code point boundaries. (The UTF-8 strings were converted
1186                     // from UTF-16 and are guaranteed to be well-formed.)
1187                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1188                         !offsets.containsOffset(dec) &&
1189                         matches8(s+pos-dec, s8, length8)
1190                     ) {
1191                         if(dec==pos) {
1192                             return 0;  // Reached the start of the string.
1193                         }
1194                         offsets.addOffset(dec);
1195                     }
1196                     if(overlap==0) {
1197                         break;
1198                     }
1199                     --overlap;
1200                     ++dec;
1201                 }
1202                 s8+=length8;
1203             }
1204         } else /* USET_SPAN_SIMPLE */ {
1205             int32_t maxDec=0, maxOverlap=0;
1206             for(i=0; i<stringsLength; ++i) {
1207                 length8=utf8Lengths[i];
1208                 if(length8==0) {
1209                     continue;  // String not representable in UTF-8.
1210                 }
1211                 int32_t overlap=spanBackUTF8Lengths[i];
1212                 // For longest match, we do need to try to match even an all-contained string
1213                 // to find the match from the latest end.
1214 
1215                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1216                 if(overlap>=LONG_SPAN) {
1217                     overlap=length8;
1218                     // Longest match: Need to match fully inside the code point span
1219                     // to find the match from the latest end.
1220                 }
1221                 if(overlap>spanLength) {
1222                     overlap=spanLength;
1223                 }
1224                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1225                 for(;;) {
1226                     if(dec>pos || overlap<maxOverlap) {
1227                         break;
1228                     }
1229                     // Try to match if the string is longer or ends later.
1230                     // Match at code point boundaries. (The UTF-8 strings were converted
1231                     // from UTF-16 and are guaranteed to be well-formed.)
1232                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1233                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234                         matches8(s+pos-dec, s8, length8)
1235                     ) {
1236                         maxDec=dec;  // Longest match from latest end.
1237                         maxOverlap=overlap;
1238                         break;
1239                     }
1240                     --overlap;
1241                     ++dec;
1242                 }
1243                 s8+=length8;
1244             }
1245 
1246             if(maxDec!=0 || maxOverlap!=0) {
1247                 // Longest-match algorithm, and there was a string match.
1248                 // Simply continue before it.
1249                 pos-=maxDec;
1250                 if(pos==0) {
1251                     return 0;  // Reached the start of the string.
1252                 }
1253                 spanLength=0;  // Match strings from before a string match.
1254                 continue;
1255             }
1256         }
1257         // Finished trying to match all strings at pos.
1258 
1259         if(spanLength!=0 || pos==length) {
1260             // The position is before an unlimited code point span (spanLength!=0),
1261             // not before a string match.
1262             // The only position where spanLength==0 before a span is pos==length.
1263             // Otherwise, an unlimited code point span is only tried again when no
1264             // strings match, and if such a non-initial span fails we stop.
1265             if(offsets.isEmpty()) {
1266                 return pos;  // No strings matched before a span.
1267             }
1268             // Match strings from before the next string match.
1269         } else {
1270             // The position is before a string match (or a single code point).
1271             if(offsets.isEmpty()) {
1272                 // No more strings matched before a previous string match.
1273                 // Try another code point span from before the last string match.
1274                 int32_t oldPos=pos;
1275                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276                 spanLength=oldPos-pos;
1277                 if( pos==0 ||           // Reached the start of the string, or
1278                     spanLength==0       // neither strings nor span progressed.
1279                 ) {
1280                     return pos;
1281                 }
1282                 continue;  // spanLength>0: Match strings from before a span.
1283             } else {
1284                 // Try to match only one code point from before a string match if some
1285                 // string matched beyond it, so that we try all possible positions
1286                 // and don't overshoot.
1287                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1288                 if(spanLength>0) {
1289                     if(spanLength==pos) {
1290                         return 0;  // Reached the start of the string.
1291                     }
1292                     // Match strings before this code point.
1293                     // There cannot be any decrements below it because UnicodeSet strings
1294                     // contain multiple code points.
1295                     pos-=spanLength;
1296                     offsets.shift(spanLength);
1297                     spanLength=0;
1298                     continue;  // Match strings from before a single code point.
1299                 }
1300                 // Match strings from before the next string match.
1301             }
1302         }
1303         pos-=offsets.popMinimum();
1304         spanLength=0;  // Match strings from before a string match.
1305     }
1306 }
1307 
1308 /*
1309  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310  *
1311  * Theoretical algorithm:
1312  * - Iterate through the string, and at each code point boundary:
1313  *   + If the code point there is in the set, then return with the current position.
1314  *   + If a set string matches at the current position, then return with the current position.
1315  *
1316  * Optimized implementation:
1317  *
1318  * (Same assumption as for span() above.)
1319  *
1320  * Create and cache a spanNotSet which contains all of the single code points
1321  * of the original set but none of its strings.
1322  * For each set string add its initial code point to the spanNotSet.
1323  * (Also add its final code point for spanNotBack().)
1324  *
1325  * - Loop:
1326  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327  *   + If the current code point is in the original set, then
1328  *     return the current position.
1329  *   + If any set string matches at the current position, then
1330  *     return the current position.
1331  *   + If there is no match at the current position, neither for the code point there
1332  *     nor for any set string, then skip this code point and continue the loop.
1333  *     This happens for set-string-initial code points that were added to spanNotSet
1334  *     when there is not actually a match for such a set string.
1335  */
1336 
spanNot(const UChar * s,int32_t length) const1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338     int32_t pos=0, rest=length;
1339     int32_t i, stringsLength=strings.size();
1340     do {
1341         // Span until we find a code point from the set,
1342         // or a code point that starts or ends some string.
1343         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344         if(i==rest) {
1345             return length;  // Reached the end of the string.
1346         }
1347         pos+=i;
1348         rest-=i;
1349 
1350         // Check whether the current code point is in the original set,
1351         // without the string starts and ends.
1352         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353         if(cpLength>0) {
1354             return pos;  // There is a set element at pos.
1355         }
1356 
1357         // Try to match the strings at pos.
1358         for(i=0; i<stringsLength; ++i) {
1359             if(spanLengths[i]==ALL_CP_CONTAINED) {
1360                 continue;  // Irrelevant string.
1361             }
1362             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363             const UChar *s16=string.getBuffer();
1364             int32_t length16=string.length();
1365             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366                 return pos;  // There is a set element at pos.
1367             }
1368         }
1369 
1370         // The span(while not contained) ended on a string start/end which is
1371         // not in the original set. Skip this code point and continue.
1372         // cpLength<0
1373         pos-=cpLength;
1374         rest+=cpLength;
1375     } while(rest!=0);
1376     return length;  // Reached the end of the string.
1377 }
1378 
spanNotBack(const UChar * s,int32_t length) const1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380     int32_t pos=length;
1381     int32_t i, stringsLength=strings.size();
1382     do {
1383         // Span until we find a code point from the set,
1384         // or a code point that starts or ends some string.
1385         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386         if(pos==0) {
1387             return 0;  // Reached the start of the string.
1388         }
1389 
1390         // Check whether the current code point is in the original set,
1391         // without the string starts and ends.
1392         int32_t cpLength=spanOneBack(spanSet, s, pos);
1393         if(cpLength>0) {
1394             return pos;  // There is a set element at pos.
1395         }
1396 
1397         // Try to match the strings at pos.
1398         for(i=0; i<stringsLength; ++i) {
1399             // Use spanLengths rather than a spanBackLengths pointer because
1400             // it is easier and we only need to know whether the string is irrelevant
1401             // which is the same in either array.
1402             if(spanLengths[i]==ALL_CP_CONTAINED) {
1403                 continue;  // Irrelevant string.
1404             }
1405             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406             const UChar *s16=string.getBuffer();
1407             int32_t length16=string.length();
1408             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409                 return pos;  // There is a set element at pos.
1410             }
1411         }
1412 
1413         // The span(while not contained) ended on a string start/end which is
1414         // not in the original set. Skip this code point and continue.
1415         // cpLength<0
1416         pos+=cpLength;
1417     } while(pos!=0);
1418     return 0;  // Reached the start of the string.
1419 }
1420 
spanNotUTF8(const uint8_t * s,int32_t length) const1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422     int32_t pos=0, rest=length;
1423     int32_t i, stringsLength=strings.size();
1424     uint8_t *spanUTF8Lengths=spanLengths;
1425     if(all) {
1426         spanUTF8Lengths+=2*stringsLength;
1427     }
1428     do {
1429         // Span until we find a code point from the set,
1430         // or a code point that starts or ends some string.
1431         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432         if(i==rest) {
1433             return length;  // Reached the end of the string.
1434         }
1435         pos+=i;
1436         rest-=i;
1437 
1438         // Check whether the current code point is in the original set,
1439         // without the string starts and ends.
1440         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441         if(cpLength>0) {
1442             return pos;  // There is a set element at pos.
1443         }
1444 
1445         // Try to match the strings at pos.
1446         const uint8_t *s8=utf8;
1447         int32_t length8;
1448         for(i=0; i<stringsLength; ++i) {
1449             length8=utf8Lengths[i];
1450             // ALL_CP_CONTAINED: Irrelevant string.
1451             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452                 return pos;  // There is a set element at pos.
1453             }
1454             s8+=length8;
1455         }
1456 
1457         // The span(while not contained) ended on a string start/end which is
1458         // not in the original set. Skip this code point and continue.
1459         // cpLength<0
1460         pos-=cpLength;
1461         rest+=cpLength;
1462     } while(rest!=0);
1463     return length;  // Reached the end of the string.
1464 }
1465 
spanNotBackUTF8(const uint8_t * s,int32_t length) const1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467     int32_t pos=length;
1468     int32_t i, stringsLength=strings.size();
1469     uint8_t *spanBackUTF8Lengths=spanLengths;
1470     if(all) {
1471         spanBackUTF8Lengths+=3*stringsLength;
1472     }
1473     do {
1474         // Span until we find a code point from the set,
1475         // or a code point that starts or ends some string.
1476         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477         if(pos==0) {
1478             return 0;  // Reached the start of the string.
1479         }
1480 
1481         // Check whether the current code point is in the original set,
1482         // without the string starts and ends.
1483         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484         if(cpLength>0) {
1485             return pos;  // There is a set element at pos.
1486         }
1487 
1488         // Try to match the strings at pos.
1489         const uint8_t *s8=utf8;
1490         int32_t length8;
1491         for(i=0; i<stringsLength; ++i) {
1492             length8=utf8Lengths[i];
1493             // ALL_CP_CONTAINED: Irrelevant string.
1494             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495                 return pos;  // There is a set element at pos.
1496             }
1497             s8+=length8;
1498         }
1499 
1500         // The span(while not contained) ended on a string start/end which is
1501         // not in the original set. Skip this code point and continue.
1502         // cpLength<0
1503         pos+=cpLength;
1504     } while(pos!=0);
1505     return 0;  // Reached the start of the string.
1506 }
1507 
1508 U_NAMESPACE_END
1509