• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 2007-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 *   file name:  unisetspan.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2007mar01
16 *   created by: Markus W. Scherer
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf8.h"
23 #include "unicode/utf16.h"
24 #include "cmemory.h"
25 #include "uvector.h"
26 #include "unisetspan.h"
27 
28 U_NAMESPACE_BEGIN
29 
30 /*
31  * List of offsets from the current position from where to try matching
32  * a code point or a string.
33  * Store offsets rather than indexes to simplify the code and use the same list
34  * for both increments (in span()) and decrements (in spanBack()).
35  *
36  * Assumption: The maximum offset is limited, and the offsets that are stored
37  * at any one time are relatively dense, that is, there are normally no gaps of
38  * hundreds or thousands of offset values.
39  *
40  * The implementation uses a circular buffer of byte flags,
41  * each indicating whether the corresponding offset is in the list.
42  * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43  * physically moving part of the list.
44  *
45  * Note: In principle, the caller should setMaxLength() to the maximum of the
46  * max string length and U16_LENGTH/U8_LENGTH to account for
47  * "long" single code points.
48  * However, this implementation uses at least a staticList with more than
49  * U8_LENGTH entries anyway.
50  *
51  * Note: If maxLength were guaranteed to be no more than 32 or 64,
52  * the list could be stored as bit flags in a single integer.
53  * Rather than handling a circular buffer with a start list index,
54  * the integer would simply be shifted when lower offsets are removed.
55  * UnicodeSet does not have a limit on the lengths of strings.
56  */
57 class OffsetList {  // Only ever stack-allocated, does not need to inherit UMemory.
58 public:
OffsetList()59     OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60 
~OffsetList()61     ~OffsetList() {
62         if(list!=staticList) {
63             uprv_free(list);
64         }
65     }
66 
67     // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68     void setMaxLength(int32_t maxLength) {
69         if(maxLength<=(int32_t)sizeof(staticList)) {
70             capacity=(int32_t)sizeof(staticList);
71         } else {
72             UBool *l=(UBool *)uprv_malloc(maxLength);
73             if(l!=NULL) {
74                 list=l;
75                 capacity=maxLength;
76             }
77         }
78         uprv_memset(list, 0, capacity);
79     }
80 
clear()81     void clear() {
82         uprv_memset(list, 0, capacity);
83         start=length=0;
84     }
85 
isEmpty() const86     UBool isEmpty() const {
87         return (UBool)(length==0);
88     }
89 
90     // Reduce all stored offsets by delta, used when the current position
91     // moves by delta.
92     // There must not be any offsets lower than delta.
93     // If there is an offset equal to delta, it is removed.
94     // delta=[1..maxLength]
shift(int32_t delta)95     void shift(int32_t delta) {
96         int32_t i=start+delta;
97         if(i>=capacity) {
98             i-=capacity;
99         }
100         if(list[i]) {
101             list[i]=FALSE;
102             --length;
103         }
104         start=i;
105     }
106 
107     // Add an offset. The list must not contain it yet.
108     // offset=[1..maxLength]
addOffset(int32_t offset)109     void addOffset(int32_t offset) {
110         int32_t i=start+offset;
111         if(i>=capacity) {
112             i-=capacity;
113         }
114         list[i]=TRUE;
115         ++length;
116     }
117 
118     // offset=[1..maxLength]
containsOffset(int32_t offset) const119     UBool containsOffset(int32_t offset) const {
120         int32_t i=start+offset;
121         if(i>=capacity) {
122             i-=capacity;
123         }
124         return list[i];
125     }
126 
127     // Find the lowest stored offset from a non-empty list, remove it,
128     // and reduce all other offsets by this minimum.
129     // Returns [1..maxLength].
popMinimum()130     int32_t popMinimum() {
131         // Look for the next offset in list[start+1..capacity-1].
132         int32_t i=start, result;
133         while(++i<capacity) {
134             if(list[i]) {
135                 list[i]=FALSE;
136                 --length;
137                 result=i-start;
138                 start=i;
139                 return result;
140             }
141         }
142         // i==capacity
143 
144         // Wrap around and look for the next offset in list[0..start].
145         // Since the list is not empty, there will be one.
146         result=capacity-start;
147         i=0;
148         while(!list[i]) {
149             ++i;
150         }
151         list[i]=FALSE;
152         --length;
153         start=i;
154         return result+=i;
155     }
156 
157 private:
158     UBool *list;
159     int32_t capacity;
160     int32_t length;
161     int32_t start;
162 
163     UBool staticList[16];
164 };
165 
166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167 static int32_t
getUTF8Length(const UChar * s,int32_t length)168 getUTF8Length(const UChar *s, int32_t length) {
169     UErrorCode errorCode=U_ZERO_ERROR;
170     int32_t length8=0;
171     u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172     if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173         return length8;
174     } else {
175         // The string contains an unpaired surrogate.
176         // Ignore this string.
177         return 0;
178     }
179 }
180 
181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184     UErrorCode errorCode=U_ZERO_ERROR;
185     int32_t length8=0;
186     u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187     if(U_SUCCESS(errorCode)) {
188         return length8;
189     } else {
190         // The string contains an unpaired surrogate.
191         // Ignore this string.
192         return 0;
193     }
194 }
195 
196 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197 makeSpanLengthByte(int32_t spanLength) {
198     // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199     return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 }
201 
202 // Construct for all variants of span(), or only for any one variant.
203 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205                                            const UVector &setStrings,
206                                            uint32_t which)
207         : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209           utf8Length(0),
210           maxLength16(0), maxLength8(0),
211           all((UBool)(which==ALL)) {
212     spanSet.retainAll(set);
213     if(which&NOT_CONTAINED) {
214         // Default to the same sets.
215         // addToSpanNotSet() will create a separate set if necessary.
216         pSpanNotSet=&spanSet;
217     }
218 
219     // Determine if the strings even need to be taken into account at all for span() etc.
220     // If any string is relevant, then all strings need to be used for
221     // span(longest match) but only the relevant ones for span(while contained).
222     // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223     //   and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224     //   (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225     // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226     int32_t stringsLength=strings.size();
227 
228     int32_t i, spanLength;
229     UBool someRelevant=FALSE;
230     for(i=0; i<stringsLength; ++i) {
231         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232         const UChar *s16=string.getBuffer();
233         int32_t length16=string.length();
234         UBool thisRelevant;
235         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
236         if(spanLength<length16) {  // Relevant string.
237             someRelevant=thisRelevant=TRUE;
238         } else {
239             thisRelevant=FALSE;
240         }
241         if((which&UTF16) && length16>maxLength16) {
242             maxLength16=length16;
243         }
244         if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
245             int32_t length8=getUTF8Length(s16, length16);
246             utf8Length+=length8;
247             if(length8>maxLength8) {
248                 maxLength8=length8;
249             }
250         }
251     }
252     if(!someRelevant) {
253         maxLength16=maxLength8=0;
254         return;
255     }
256 
257     // Freeze after checking for the need to use strings at all because freezing
258     // a set takes some time and memory which are wasted if there are no relevant strings.
259     if(all) {
260         spanSet.freeze();
261     }
262 
263     uint8_t *spanBackLengths;
264     uint8_t *spanUTF8Lengths;
265     uint8_t *spanBackUTF8Lengths;
266 
267     // Allocate a block of meta data.
268     int32_t allocSize;
269     if(all) {
270         // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
271         allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
272     } else {
273         allocSize=stringsLength;  // One set of span lengths.
274         if(which&UTF8) {
275             // UTF-8 lengths and UTF-8 strings.
276             allocSize+=stringsLength*4+utf8Length;
277         }
278     }
279     if(allocSize<=(int32_t)sizeof(staticLengths)) {
280         utf8Lengths=staticLengths;
281     } else {
282         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
283         if(utf8Lengths==NULL) {
284             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
285             return;  // Out of memory.
286         }
287     }
288 
289     if(all) {
290         // Store span lengths for all span() variants.
291         spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
292         spanBackLengths=spanLengths+stringsLength;
293         spanUTF8Lengths=spanBackLengths+stringsLength;
294         spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
295         utf8=spanBackUTF8Lengths+stringsLength;
296     } else {
297         // Store span lengths for only one span() variant.
298         if(which&UTF8) {
299             spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
300             utf8=spanLengths+stringsLength;
301         } else {
302             spanLengths=(uint8_t *)utf8Lengths;
303         }
304         spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
305     }
306 
307     // Set the meta data and pSpanNotSet and write the UTF-8 strings.
308     int32_t utf8Count=0;  // Count UTF-8 bytes written so far.
309 
310     for(i=0; i<stringsLength; ++i) {
311         const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
312         const UChar *s16=string.getBuffer();
313         int32_t length16=string.length();
314         spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
315         if(spanLength<length16) {  // Relevant string.
316             if(which&UTF16) {
317                 if(which&CONTAINED) {
318                     if(which&FWD) {
319                         spanLengths[i]=makeSpanLengthByte(spanLength);
320                     }
321                     if(which&BACK) {
322                         spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
323                         spanBackLengths[i]=makeSpanLengthByte(spanLength);
324                     }
325                 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
326                     spanLengths[i]=spanBackLengths[i]=0;  // Only store a relevant/irrelevant flag.
327                 }
328             }
329             if(which&UTF8) {
330                 uint8_t *s8=utf8+utf8Count;
331                 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
332                 utf8Count+=utf8Lengths[i]=length8;
333                 if(length8==0) {  // Irrelevant for UTF-8 because not representable in UTF-8.
334                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
335                 } else {  // Relevant for UTF-8.
336                     if(which&CONTAINED) {
337                         if(which&FWD) {
338                             spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339                             spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340                         }
341                         if(which&BACK) {
342                             spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
343                             spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
344                         }
345                     } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
346                         spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0;  // Only store a relevant/irrelevant flag.
347                     }
348                 }
349             }
350             if(which&NOT_CONTAINED) {
351                 // Add string start and end code points to the spanNotSet so that
352                 // a span(while not contained) stops before any string.
353                 UChar32 c;
354                 if(which&FWD) {
355                     int32_t len=0;
356                     U16_NEXT(s16, len, length16, c);
357                     addToSpanNotSet(c);
358                 }
359                 if(which&BACK) {
360                     int32_t len=length16;
361                     U16_PREV(s16, 0, len, c);
362                     addToSpanNotSet(c);
363                 }
364             }
365         } else {  // Irrelevant string.
366             if(which&UTF8) {
367                 if(which&CONTAINED) {  // Only necessary for LONGEST_MATCH.
368                     uint8_t *s8=utf8+utf8Count;
369                     int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
370                     utf8Count+=utf8Lengths[i]=length8;
371                 } else {
372                     utf8Lengths[i]=0;
373                 }
374             }
375             if(all) {
376                 spanLengths[i]=spanBackLengths[i]=
377                     spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
378                         (uint8_t)ALL_CP_CONTAINED;
379             } else {
380                 // All spanXYZLengths pointers contain the same address.
381                 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
382             }
383         }
384     }
385 
386     // Finish.
387     if(all) {
388         pSpanNotSet->freeze();
389     }
390 }
391 
392 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
394                                            const UVector &newParentSetStrings)
395         : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
396           utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
397           utf8Length(otherStringSpan.utf8Length),
398           maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
399           all(TRUE) {
400     if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
401         pSpanNotSet=&spanSet;
402     } else {
403         pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
404     }
405 
406     // Allocate a block of meta data.
407     // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
408     int32_t stringsLength=strings.size();
409     int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
410     if(allocSize<=(int32_t)sizeof(staticLengths)) {
411         utf8Lengths=staticLengths;
412     } else {
413         utf8Lengths=(int32_t *)uprv_malloc(allocSize);
414         if(utf8Lengths==NULL) {
415             maxLength16=maxLength8=0;  // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
416             return;  // Out of memory.
417         }
418     }
419 
420     spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
421     utf8=spanLengths+stringsLength*4;
422     uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
423 }
424 
~UnicodeSetStringSpan()425 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
426     if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
427         delete pSpanNotSet;
428     }
429     if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
430         uprv_free(utf8Lengths);
431     }
432 }
433 
addToSpanNotSet(UChar32 c)434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
435     if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
436         if(spanSet.contains(c)) {
437             return;  // Nothing to do.
438         }
439         UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
440         if(newSet==NULL) {
441             return;  // Out of memory.
442         } else {
443             pSpanNotSet=newSet;
444         }
445     }
446     pSpanNotSet->add(c);
447 }
448 
449 // Compare strings without any argument checks. Requires length>0.
450 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)451 matches16(const UChar *s, const UChar *t, int32_t length) {
452     do {
453         if(*s++!=*t++) {
454             return FALSE;
455         }
456     } while(--length>0);
457     return TRUE;
458 }
459 
460 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
462     do {
463         if(*s++!=*t++) {
464             return FALSE;
465         }
466     } while(--length>0);
467     return TRUE;
468 }
469 
470 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
471 // at code point boundaries.
472 // That is, each edge of a match must not be in the middle of a surrogate pair.
473 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
475     s+=start;
476     limit-=start;
477     return matches16(s, t, length) &&
478            !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
479            !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
480 }
481 
482 // Does the set contain the next code point?
483 // If so, return its length; otherwise return its negative length.
484 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
486     UChar c=*s, c2;
487     if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
488         return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
489     }
490     return set.contains(c) ? 1 : -1;
491 }
492 
493 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
495     UChar c=s[length-1], c2;
496     if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
497         return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
498     }
499     return set.contains(c) ? 1 : -1;
500 }
501 
502 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
504     UChar32 c=*s;
505     if(U8_IS_SINGLE(c)) {
506         return set.contains(c) ? 1 : -1;
507     }
508     // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
509     int32_t i=0;
510     U8_NEXT_OR_FFFD(s, i, length, c);
511     return set.contains(c) ? i : -i;
512 }
513 
514 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
516     UChar32 c=s[length-1];
517     if(U8_IS_SINGLE(c)) {
518         return set.contains(c) ? 1 : -1;
519     }
520     int32_t i=length-1;
521     c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
522     length-=i;
523     return set.contains(c) ? length : -length;
524 }
525 
526 /*
527  * Note: In span() when spanLength==0 (after a string match, or at the beginning
528  * after an empty code point span) and in spanNot() and spanNotUTF8(),
529  * string matching could use a binary search
530  * because all string matches are done from the same start index.
531  *
532  * For UTF-8, this would require a comparison function that returns UTF-16 order.
533  *
534  * This optimization should not be necessary for normal UnicodeSets because
535  * most sets have no strings, and most sets with strings have
536  * very few very short strings.
537  * For cases with many strings, it might be better to use a different API
538  * and implementation with a DFA (state machine).
539  */
540 
541 /*
542  * Algorithm for span(USET_SPAN_CONTAINED)
543  *
544  * Theoretical algorithm:
545  * - Iterate through the string, and at each code point boundary:
546  *   + If the code point there is in the set, then remember to continue after it.
547  *   + If a set string matches at the current position, then remember to continue after it.
548  *   + Either recursively span for each code point or string match,
549  *     or recursively span for all but the shortest one and
550  *     iteratively continue the span with the shortest local match.
551  *   + Remember the longest recursive span (the farthest end point).
552  *   + If there is no match at the current position, neither for the code point there
553  *     nor for any set string, then stop and return the longest recursive span length.
554  *
555  * Optimized implementation:
556  *
557  * (We assume that most sets will have very few very short strings.
558  * A span using a string-less set is extremely fast.)
559  *
560  * Create and cache a spanSet which contains all of the single code points
561  * of the original set but none of its strings.
562  *
563  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
564  * - Loop:
565  *   + Try to match each set string at the end of the spanLength.
566  *     ~ Set strings that start with set-contained code points must be matched
567  *       with a partial overlap because the recursive algorithm would have tried
568  *       to match them at every position.
569  *     ~ Set strings that entirely consist of set-contained code points
570  *       are irrelevant for span(USET_SPAN_CONTAINED) because the
571  *       recursive algorithm would continue after them anyway
572  *       and find the longest recursive match from their end.
573  *     ~ Rather than recursing, note each end point of a set string match.
574  *   + If no set string matched after spanSet.span(), then return
575  *     with where the spanSet.span() ended.
576  *   + If at least one set string matched after spanSet.span(), then
577  *     pop the shortest string match end point and continue
578  *     the loop, trying to match all set strings from there.
579  *   + If at least one more set string matched after a previous string match,
580  *     then test if the code point after the previous string match is also
581  *     contained in the set.
582  *     Continue the loop with the shortest end point of either this code point
583  *     or a matching set string.
584  *   + If no more set string matched after a previous string match,
585  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
586  *     Stop if spanLength==0, otherwise continue the loop.
587  *
588  * By noting each end point of a set string match,
589  * the function visits each string position at most once and finishes
590  * in linear time.
591  *
592  * The recursive algorithm may visit the same string position many times
593  * if multiple paths lead to it and finishes in exponential time.
594  */
595 
596 /*
597  * Algorithm for span(USET_SPAN_SIMPLE)
598  *
599  * Theoretical algorithm:
600  * - Iterate through the string, and at each code point boundary:
601  *   + If the code point there is in the set, then remember to continue after it.
602  *   + If a set string matches at the current position, then remember to continue after it.
603  *   + Continue from the farthest match position and ignore all others.
604  *   + If there is no match at the current position,
605  *     then stop and return the current position.
606  *
607  * Optimized implementation:
608  *
609  * (Same assumption and spanSet as above.)
610  *
611  * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
612  * - Loop:
613  *   + Try to match each set string at the end of the spanLength.
614  *     ~ Set strings that start with set-contained code points must be matched
615  *       with a partial overlap because the standard algorithm would have tried
616  *       to match them earlier.
617  *     ~ Set strings that entirely consist of set-contained code points
618  *       must be matched with a full overlap because the longest-match algorithm
619  *       would hide set string matches that end earlier.
620  *       Such set strings need not be matched earlier inside the code point span
621  *       because the standard algorithm would then have continued after
622  *       the set string match anyway.
623  *     ~ Remember the longest set string match (farthest end point) from the earliest
624  *       starting point.
625  *   + If no set string matched after spanSet.span(), then return
626  *     with where the spanSet.span() ended.
627  *   + If at least one set string matched, then continue the loop after the
628  *     longest match from the earliest position.
629  *   + If no more set string matched after a previous string match,
630  *     then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
631  *     Stop if spanLength==0, otherwise continue the loop.
632  */
633 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
635     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
636         return spanNot(s, length);
637     }
638     int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
639     if(spanLength==length) {
640         return length;
641     }
642 
643     // Consider strings; they may overlap with the span.
644     OffsetList offsets;
645     if(spanCondition==USET_SPAN_CONTAINED) {
646         // Use offset list to try all possibilities.
647         offsets.setMaxLength(maxLength16);
648     }
649     int32_t pos=spanLength, rest=length-pos;
650     int32_t i, stringsLength=strings.size();
651     for(;;) {
652         if(spanCondition==USET_SPAN_CONTAINED) {
653             for(i=0; i<stringsLength; ++i) {
654                 int32_t overlap=spanLengths[i];
655                 if(overlap==ALL_CP_CONTAINED) {
656                     continue;  // Irrelevant string.
657                 }
658                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
659                 const UChar *s16=string.getBuffer();
660                 int32_t length16=string.length();
661 
662                 // Try to match this string at pos-overlap..pos.
663                 if(overlap>=LONG_SPAN) {
664                     overlap=length16;
665                     // While contained: No point matching fully inside the code point span.
666                     U16_BACK_1(s16, 0, overlap);  // Length of the string minus the last code point.
667                 }
668                 if(overlap>spanLength) {
669                     overlap=spanLength;
670                 }
671                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
672                 for(;;) {
673                     if(inc>rest) {
674                         break;
675                     }
676                     // Try to match if the increment is not listed already.
677                     if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
678                         if(inc==rest) {
679                             return length;  // Reached the end of the string.
680                         }
681                         offsets.addOffset(inc);
682                     }
683                     if(overlap==0) {
684                         break;
685                     }
686                     --overlap;
687                     ++inc;
688                 }
689             }
690         } else /* USET_SPAN_SIMPLE */ {
691             int32_t maxInc=0, maxOverlap=0;
692             for(i=0; i<stringsLength; ++i) {
693                 int32_t overlap=spanLengths[i];
694                 // For longest match, we do need to try to match even an all-contained string
695                 // to find the match from the earliest start.
696 
697                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
698                 const UChar *s16=string.getBuffer();
699                 int32_t length16=string.length();
700 
701                 // Try to match this string at pos-overlap..pos.
702                 if(overlap>=LONG_SPAN) {
703                     overlap=length16;
704                     // Longest match: Need to match fully inside the code point span
705                     // to find the match from the earliest start.
706                 }
707                 if(overlap>spanLength) {
708                     overlap=spanLength;
709                 }
710                 int32_t inc=length16-overlap;  // Keep overlap+inc==length16.
711                 for(;;) {
712                     if(inc>rest || overlap<maxOverlap) {
713                         break;
714                     }
715                     // Try to match if the string is longer or starts earlier.
716                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
717                         matches16CPB(s, pos-overlap, length, s16, length16)
718                     ) {
719                         maxInc=inc;  // Longest match from earliest start.
720                         maxOverlap=overlap;
721                         break;
722                     }
723                     --overlap;
724                     ++inc;
725                 }
726             }
727 
728             if(maxInc!=0 || maxOverlap!=0) {
729                 // Longest-match algorithm, and there was a string match.
730                 // Simply continue after it.
731                 pos+=maxInc;
732                 rest-=maxInc;
733                 if(rest==0) {
734                     return length;  // Reached the end of the string.
735                 }
736                 spanLength=0;  // Match strings from after a string match.
737                 continue;
738             }
739         }
740         // Finished trying to match all strings at pos.
741 
742         if(spanLength!=0 || pos==0) {
743             // The position is after an unlimited code point span (spanLength!=0),
744             // not after a string match.
745             // The only position where spanLength==0 after a span is pos==0.
746             // Otherwise, an unlimited code point span is only tried again when no
747             // strings match, and if such a non-initial span fails we stop.
748             if(offsets.isEmpty()) {
749                 return pos;  // No strings matched after a span.
750             }
751             // Match strings from after the next string match.
752         } else {
753             // The position is after a string match (or a single code point).
754             if(offsets.isEmpty()) {
755                 // No more strings matched after a previous string match.
756                 // Try another code point span from after the last string match.
757                 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
758                 if( spanLength==rest || // Reached the end of the string, or
759                     spanLength==0       // neither strings nor span progressed.
760                 ) {
761                     return pos+spanLength;
762                 }
763                 pos+=spanLength;
764                 rest-=spanLength;
765                 continue;  // spanLength>0: Match strings from after a span.
766             } else {
767                 // Try to match only one code point from after a string match if some
768                 // string matched beyond it, so that we try all possible positions
769                 // and don't overshoot.
770                 spanLength=spanOne(spanSet, s+pos, rest);
771                 if(spanLength>0) {
772                     if(spanLength==rest) {
773                         return length;  // Reached the end of the string.
774                     }
775                     // Match strings after this code point.
776                     // There cannot be any increments below it because UnicodeSet strings
777                     // contain multiple code points.
778                     pos+=spanLength;
779                     rest-=spanLength;
780                     offsets.shift(spanLength);
781                     spanLength=0;
782                     continue;  // Match strings from after a single code point.
783                 }
784                 // Match strings from after the next string match.
785             }
786         }
787         int32_t minOffset=offsets.popMinimum();
788         pos+=minOffset;
789         rest-=minOffset;
790         spanLength=0;  // Match strings from after a string match.
791     }
792 }
793 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
795     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
796         return spanNotBack(s, length);
797     }
798     int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
799     if(pos==0) {
800         return 0;
801     }
802     int32_t spanLength=length-pos;
803 
804     // Consider strings; they may overlap with the span.
805     OffsetList offsets;
806     if(spanCondition==USET_SPAN_CONTAINED) {
807         // Use offset list to try all possibilities.
808         offsets.setMaxLength(maxLength16);
809     }
810     int32_t i, stringsLength=strings.size();
811     uint8_t *spanBackLengths=spanLengths;
812     if(all) {
813         spanBackLengths+=stringsLength;
814     }
815     for(;;) {
816         if(spanCondition==USET_SPAN_CONTAINED) {
817             for(i=0; i<stringsLength; ++i) {
818                 int32_t overlap=spanBackLengths[i];
819                 if(overlap==ALL_CP_CONTAINED) {
820                     continue;  // Irrelevant string.
821                 }
822                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
823                 const UChar *s16=string.getBuffer();
824                 int32_t length16=string.length();
825 
826                 // Try to match this string at pos-(length16-overlap)..pos-length16.
827                 if(overlap>=LONG_SPAN) {
828                     overlap=length16;
829                     // While contained: No point matching fully inside the code point span.
830                     int32_t len1=0;
831                     U16_FWD_1(s16, len1, overlap);
832                     overlap-=len1;  // Length of the string minus the first code point.
833                 }
834                 if(overlap>spanLength) {
835                     overlap=spanLength;
836                 }
837                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
838                 for(;;) {
839                     if(dec>pos) {
840                         break;
841                     }
842                     // Try to match if the decrement is not listed already.
843                     if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
844                         if(dec==pos) {
845                             return 0;  // Reached the start of the string.
846                         }
847                         offsets.addOffset(dec);
848                     }
849                     if(overlap==0) {
850                         break;
851                     }
852                     --overlap;
853                     ++dec;
854                 }
855             }
856         } else /* USET_SPAN_SIMPLE */ {
857             int32_t maxDec=0, maxOverlap=0;
858             for(i=0; i<stringsLength; ++i) {
859                 int32_t overlap=spanBackLengths[i];
860                 // For longest match, we do need to try to match even an all-contained string
861                 // to find the match from the latest end.
862 
863                 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
864                 const UChar *s16=string.getBuffer();
865                 int32_t length16=string.length();
866 
867                 // Try to match this string at pos-(length16-overlap)..pos-length16.
868                 if(overlap>=LONG_SPAN) {
869                     overlap=length16;
870                     // Longest match: Need to match fully inside the code point span
871                     // to find the match from the latest end.
872                 }
873                 if(overlap>spanLength) {
874                     overlap=spanLength;
875                 }
876                 int32_t dec=length16-overlap;  // Keep dec+overlap==length16.
877                 for(;;) {
878                     if(dec>pos || overlap<maxOverlap) {
879                         break;
880                     }
881                     // Try to match if the string is longer or ends later.
882                     if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
883                         matches16CPB(s, pos-dec, length, s16, length16)
884                     ) {
885                         maxDec=dec;  // Longest match from latest end.
886                         maxOverlap=overlap;
887                         break;
888                     }
889                     --overlap;
890                     ++dec;
891                 }
892             }
893 
894             if(maxDec!=0 || maxOverlap!=0) {
895                 // Longest-match algorithm, and there was a string match.
896                 // Simply continue before it.
897                 pos-=maxDec;
898                 if(pos==0) {
899                     return 0;  // Reached the start of the string.
900                 }
901                 spanLength=0;  // Match strings from before a string match.
902                 continue;
903             }
904         }
905         // Finished trying to match all strings at pos.
906 
907         if(spanLength!=0 || pos==length) {
908             // The position is before an unlimited code point span (spanLength!=0),
909             // not before a string match.
910             // The only position where spanLength==0 before a span is pos==length.
911             // Otherwise, an unlimited code point span is only tried again when no
912             // strings match, and if such a non-initial span fails we stop.
913             if(offsets.isEmpty()) {
914                 return pos;  // No strings matched before a span.
915             }
916             // Match strings from before the next string match.
917         } else {
918             // The position is before a string match (or a single code point).
919             if(offsets.isEmpty()) {
920                 // No more strings matched before a previous string match.
921                 // Try another code point span from before the last string match.
922                 int32_t oldPos=pos;
923                 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
924                 spanLength=oldPos-pos;
925                 if( pos==0 ||           // Reached the start of the string, or
926                     spanLength==0       // neither strings nor span progressed.
927                 ) {
928                     return pos;
929                 }
930                 continue;  // spanLength>0: Match strings from before a span.
931             } else {
932                 // Try to match only one code point from before a string match if some
933                 // string matched beyond it, so that we try all possible positions
934                 // and don't overshoot.
935                 spanLength=spanOneBack(spanSet, s, pos);
936                 if(spanLength>0) {
937                     if(spanLength==pos) {
938                         return 0;  // Reached the start of the string.
939                     }
940                     // Match strings before this code point.
941                     // There cannot be any decrements below it because UnicodeSet strings
942                     // contain multiple code points.
943                     pos-=spanLength;
944                     offsets.shift(spanLength);
945                     spanLength=0;
946                     continue;  // Match strings from before a single code point.
947                 }
948                 // Match strings from before the next string match.
949             }
950         }
951         pos-=offsets.popMinimum();
952         spanLength=0;  // Match strings from before a string match.
953     }
954 }
955 
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
957     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
958         return spanNotUTF8(s, length);
959     }
960     int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
961     if(spanLength==length) {
962         return length;
963     }
964 
965     // Consider strings; they may overlap with the span.
966     OffsetList offsets;
967     if(spanCondition==USET_SPAN_CONTAINED) {
968         // Use offset list to try all possibilities.
969         offsets.setMaxLength(maxLength8);
970     }
971     int32_t pos=spanLength, rest=length-pos;
972     int32_t i, stringsLength=strings.size();
973     uint8_t *spanUTF8Lengths=spanLengths;
974     if(all) {
975         spanUTF8Lengths+=2*stringsLength;
976     }
977     for(;;) {
978         const uint8_t *s8=utf8;
979         int32_t length8;
980         if(spanCondition==USET_SPAN_CONTAINED) {
981             for(i=0; i<stringsLength; ++i) {
982                 length8=utf8Lengths[i];
983                 if(length8==0) {
984                     continue;  // String not representable in UTF-8.
985                 }
986                 int32_t overlap=spanUTF8Lengths[i];
987                 if(overlap==ALL_CP_CONTAINED) {
988                     s8+=length8;
989                     continue;  // Irrelevant string.
990                 }
991 
992                 // Try to match this string at pos-overlap..pos.
993                 if(overlap>=LONG_SPAN) {
994                     overlap=length8;
995                     // While contained: No point matching fully inside the code point span.
996                     U8_BACK_1(s8, 0, overlap);  // Length of the string minus the last code point.
997                 }
998                 if(overlap>spanLength) {
999                     overlap=spanLength;
1000                 }
1001                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1002                 for(;;) {
1003                     if(inc>rest) {
1004                         break;
1005                     }
1006                     // Try to match if the increment is not listed already.
1007                     // Match at code point boundaries. (The UTF-8 strings were converted
1008                     // from UTF-16 and are guaranteed to be well-formed.)
1009                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1010                             !offsets.containsOffset(inc) &&
1011                             matches8(s+pos-overlap, s8, length8)) {
1012                         if(inc==rest) {
1013                             return length;  // Reached the end of the string.
1014                         }
1015                         offsets.addOffset(inc);
1016                     }
1017                     if(overlap==0) {
1018                         break;
1019                     }
1020                     --overlap;
1021                     ++inc;
1022                 }
1023                 s8+=length8;
1024             }
1025         } else /* USET_SPAN_SIMPLE */ {
1026             int32_t maxInc=0, maxOverlap=0;
1027             for(i=0; i<stringsLength; ++i) {
1028                 length8=utf8Lengths[i];
1029                 if(length8==0) {
1030                     continue;  // String not representable in UTF-8.
1031                 }
1032                 int32_t overlap=spanUTF8Lengths[i];
1033                 // For longest match, we do need to try to match even an all-contained string
1034                 // to find the match from the earliest start.
1035 
1036                 // Try to match this string at pos-overlap..pos.
1037                 if(overlap>=LONG_SPAN) {
1038                     overlap=length8;
1039                     // Longest match: Need to match fully inside the code point span
1040                     // to find the match from the earliest start.
1041                 }
1042                 if(overlap>spanLength) {
1043                     overlap=spanLength;
1044                 }
1045                 int32_t inc=length8-overlap;  // Keep overlap+inc==length8.
1046                 for(;;) {
1047                     if(inc>rest || overlap<maxOverlap) {
1048                         break;
1049                     }
1050                     // Try to match if the string is longer or starts earlier.
1051                     // Match at code point boundaries. (The UTF-8 strings were converted
1052                     // from UTF-16 and are guaranteed to be well-formed.)
1053                     if(!U8_IS_TRAIL(s[pos-overlap]) &&
1054                             (overlap>maxOverlap ||
1055                                 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1056                             matches8(s+pos-overlap, s8, length8)) {
1057                         maxInc=inc;  // Longest match from earliest start.
1058                         maxOverlap=overlap;
1059                         break;
1060                     }
1061                     --overlap;
1062                     ++inc;
1063                 }
1064                 s8+=length8;
1065             }
1066 
1067             if(maxInc!=0 || maxOverlap!=0) {
1068                 // Longest-match algorithm, and there was a string match.
1069                 // Simply continue after it.
1070                 pos+=maxInc;
1071                 rest-=maxInc;
1072                 if(rest==0) {
1073                     return length;  // Reached the end of the string.
1074                 }
1075                 spanLength=0;  // Match strings from after a string match.
1076                 continue;
1077             }
1078         }
1079         // Finished trying to match all strings at pos.
1080 
1081         if(spanLength!=0 || pos==0) {
1082             // The position is after an unlimited code point span (spanLength!=0),
1083             // not after a string match.
1084             // The only position where spanLength==0 after a span is pos==0.
1085             // Otherwise, an unlimited code point span is only tried again when no
1086             // strings match, and if such a non-initial span fails we stop.
1087             if(offsets.isEmpty()) {
1088                 return pos;  // No strings matched after a span.
1089             }
1090             // Match strings from after the next string match.
1091         } else {
1092             // The position is after a string match (or a single code point).
1093             if(offsets.isEmpty()) {
1094                 // No more strings matched after a previous string match.
1095                 // Try another code point span from after the last string match.
1096                 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1097                 if( spanLength==rest || // Reached the end of the string, or
1098                     spanLength==0       // neither strings nor span progressed.
1099                 ) {
1100                     return pos+spanLength;
1101                 }
1102                 pos+=spanLength;
1103                 rest-=spanLength;
1104                 continue;  // spanLength>0: Match strings from after a span.
1105             } else {
1106                 // Try to match only one code point from after a string match if some
1107                 // string matched beyond it, so that we try all possible positions
1108                 // and don't overshoot.
1109                 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1110                 if(spanLength>0) {
1111                     if(spanLength==rest) {
1112                         return length;  // Reached the end of the string.
1113                     }
1114                     // Match strings after this code point.
1115                     // There cannot be any increments below it because UnicodeSet strings
1116                     // contain multiple code points.
1117                     pos+=spanLength;
1118                     rest-=spanLength;
1119                     offsets.shift(spanLength);
1120                     spanLength=0;
1121                     continue;  // Match strings from after a single code point.
1122                 }
1123                 // Match strings from after the next string match.
1124             }
1125         }
1126         int32_t minOffset=offsets.popMinimum();
1127         pos+=minOffset;
1128         rest-=minOffset;
1129         spanLength=0;  // Match strings from after a string match.
1130     }
1131 }
1132 
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1133 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1134     if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1135         return spanNotBackUTF8(s, length);
1136     }
1137     int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1138     if(pos==0) {
1139         return 0;
1140     }
1141     int32_t spanLength=length-pos;
1142 
1143     // Consider strings; they may overlap with the span.
1144     OffsetList offsets;
1145     if(spanCondition==USET_SPAN_CONTAINED) {
1146         // Use offset list to try all possibilities.
1147         offsets.setMaxLength(maxLength8);
1148     }
1149     int32_t i, stringsLength=strings.size();
1150     uint8_t *spanBackUTF8Lengths=spanLengths;
1151     if(all) {
1152         spanBackUTF8Lengths+=3*stringsLength;
1153     }
1154     for(;;) {
1155         const uint8_t *s8=utf8;
1156         int32_t length8;
1157         if(spanCondition==USET_SPAN_CONTAINED) {
1158             for(i=0; i<stringsLength; ++i) {
1159                 length8=utf8Lengths[i];
1160                 if(length8==0) {
1161                     continue;  // String not representable in UTF-8.
1162                 }
1163                 int32_t overlap=spanBackUTF8Lengths[i];
1164                 if(overlap==ALL_CP_CONTAINED) {
1165                     s8+=length8;
1166                     continue;  // Irrelevant string.
1167                 }
1168 
1169                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1170                 if(overlap>=LONG_SPAN) {
1171                     overlap=length8;
1172                     // While contained: No point matching fully inside the code point span.
1173                     int32_t len1=0;
1174                     U8_FWD_1(s8, len1, overlap);
1175                     overlap-=len1;  // Length of the string minus the first code point.
1176                 }
1177                 if(overlap>spanLength) {
1178                     overlap=spanLength;
1179                 }
1180                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1181                 for(;;) {
1182                     if(dec>pos) {
1183                         break;
1184                     }
1185                     // Try to match if the decrement is not listed already.
1186                     // Match at code point boundaries. (The UTF-8 strings were converted
1187                     // from UTF-16 and are guaranteed to be well-formed.)
1188                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1189                         !offsets.containsOffset(dec) &&
1190                         matches8(s+pos-dec, s8, length8)
1191                     ) {
1192                         if(dec==pos) {
1193                             return 0;  // Reached the start of the string.
1194                         }
1195                         offsets.addOffset(dec);
1196                     }
1197                     if(overlap==0) {
1198                         break;
1199                     }
1200                     --overlap;
1201                     ++dec;
1202                 }
1203                 s8+=length8;
1204             }
1205         } else /* USET_SPAN_SIMPLE */ {
1206             int32_t maxDec=0, maxOverlap=0;
1207             for(i=0; i<stringsLength; ++i) {
1208                 length8=utf8Lengths[i];
1209                 if(length8==0) {
1210                     continue;  // String not representable in UTF-8.
1211                 }
1212                 int32_t overlap=spanBackUTF8Lengths[i];
1213                 // For longest match, we do need to try to match even an all-contained string
1214                 // to find the match from the latest end.
1215 
1216                 // Try to match this string at pos-(length8-overlap)..pos-length8.
1217                 if(overlap>=LONG_SPAN) {
1218                     overlap=length8;
1219                     // Longest match: Need to match fully inside the code point span
1220                     // to find the match from the latest end.
1221                 }
1222                 if(overlap>spanLength) {
1223                     overlap=spanLength;
1224                 }
1225                 int32_t dec=length8-overlap;  // Keep dec+overlap==length8.
1226                 for(;;) {
1227                     if(dec>pos || overlap<maxOverlap) {
1228                         break;
1229                     }
1230                     // Try to match if the string is longer or ends later.
1231                     // Match at code point boundaries. (The UTF-8 strings were converted
1232                     // from UTF-16 and are guaranteed to be well-formed.)
1233                     if( !U8_IS_TRAIL(s[pos-dec]) &&
1234                         (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1235                         matches8(s+pos-dec, s8, length8)
1236                     ) {
1237                         maxDec=dec;  // Longest match from latest end.
1238                         maxOverlap=overlap;
1239                         break;
1240                     }
1241                     --overlap;
1242                     ++dec;
1243                 }
1244                 s8+=length8;
1245             }
1246 
1247             if(maxDec!=0 || maxOverlap!=0) {
1248                 // Longest-match algorithm, and there was a string match.
1249                 // Simply continue before it.
1250                 pos-=maxDec;
1251                 if(pos==0) {
1252                     return 0;  // Reached the start of the string.
1253                 }
1254                 spanLength=0;  // Match strings from before a string match.
1255                 continue;
1256             }
1257         }
1258         // Finished trying to match all strings at pos.
1259 
1260         if(spanLength!=0 || pos==length) {
1261             // The position is before an unlimited code point span (spanLength!=0),
1262             // not before a string match.
1263             // The only position where spanLength==0 before a span is pos==length.
1264             // Otherwise, an unlimited code point span is only tried again when no
1265             // strings match, and if such a non-initial span fails we stop.
1266             if(offsets.isEmpty()) {
1267                 return pos;  // No strings matched before a span.
1268             }
1269             // Match strings from before the next string match.
1270         } else {
1271             // The position is before a string match (or a single code point).
1272             if(offsets.isEmpty()) {
1273                 // No more strings matched before a previous string match.
1274                 // Try another code point span from before the last string match.
1275                 int32_t oldPos=pos;
1276                 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1277                 spanLength=oldPos-pos;
1278                 if( pos==0 ||           // Reached the start of the string, or
1279                     spanLength==0       // neither strings nor span progressed.
1280                 ) {
1281                     return pos;
1282                 }
1283                 continue;  // spanLength>0: Match strings from before a span.
1284             } else {
1285                 // Try to match only one code point from before a string match if some
1286                 // string matched beyond it, so that we try all possible positions
1287                 // and don't overshoot.
1288                 spanLength=spanOneBackUTF8(spanSet, s, pos);
1289                 if(spanLength>0) {
1290                     if(spanLength==pos) {
1291                         return 0;  // Reached the start of the string.
1292                     }
1293                     // Match strings before this code point.
1294                     // There cannot be any decrements below it because UnicodeSet strings
1295                     // contain multiple code points.
1296                     pos-=spanLength;
1297                     offsets.shift(spanLength);
1298                     spanLength=0;
1299                     continue;  // Match strings from before a single code point.
1300                 }
1301                 // Match strings from before the next string match.
1302             }
1303         }
1304         pos-=offsets.popMinimum();
1305         spanLength=0;  // Match strings from before a string match.
1306     }
1307 }
1308 
1309 /*
1310  * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1311  *
1312  * Theoretical algorithm:
1313  * - Iterate through the string, and at each code point boundary:
1314  *   + If the code point there is in the set, then return with the current position.
1315  *   + If a set string matches at the current position, then return with the current position.
1316  *
1317  * Optimized implementation:
1318  *
1319  * (Same assumption as for span() above.)
1320  *
1321  * Create and cache a spanNotSet which contains all of the single code points
1322  * of the original set but none of its strings.
1323  * For each set string add its initial code point to the spanNotSet.
1324  * (Also add its final code point for spanNotBack().)
1325  *
1326  * - Loop:
1327  *   + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1328  *   + If the current code point is in the original set, then
1329  *     return the current position.
1330  *   + If any set string matches at the current position, then
1331  *     return the current position.
1332  *   + If there is no match at the current position, neither for the code point there
1333  *     nor for any set string, then skip this code point and continue the loop.
1334  *     This happens for set-string-initial code points that were added to spanNotSet
1335  *     when there is not actually a match for such a set string.
1336  */
1337 
spanNot(const UChar * s,int32_t length) const1338 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1339     int32_t pos=0, rest=length;
1340     int32_t i, stringsLength=strings.size();
1341     do {
1342         // Span until we find a code point from the set,
1343         // or a code point that starts or ends some string.
1344         i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1345         if(i==rest) {
1346             return length;  // Reached the end of the string.
1347         }
1348         pos+=i;
1349         rest-=i;
1350 
1351         // Check whether the current code point is in the original set,
1352         // without the string starts and ends.
1353         int32_t cpLength=spanOne(spanSet, s+pos, rest);
1354         if(cpLength>0) {
1355             return pos;  // There is a set element at pos.
1356         }
1357 
1358         // Try to match the strings at pos.
1359         for(i=0; i<stringsLength; ++i) {
1360             if(spanLengths[i]==ALL_CP_CONTAINED) {
1361                 continue;  // Irrelevant string.
1362             }
1363             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1364             const UChar *s16=string.getBuffer();
1365             int32_t length16=string.length();
1366             if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1367                 return pos;  // There is a set element at pos.
1368             }
1369         }
1370 
1371         // The span(while not contained) ended on a string start/end which is
1372         // not in the original set. Skip this code point and continue.
1373         // cpLength<0
1374         pos-=cpLength;
1375         rest+=cpLength;
1376     } while(rest!=0);
1377     return length;  // Reached the end of the string.
1378 }
1379 
spanNotBack(const UChar * s,int32_t length) const1380 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1381     int32_t pos=length;
1382     int32_t i, stringsLength=strings.size();
1383     do {
1384         // Span until we find a code point from the set,
1385         // or a code point that starts or ends some string.
1386         pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1387         if(pos==0) {
1388             return 0;  // Reached the start of the string.
1389         }
1390 
1391         // Check whether the current code point is in the original set,
1392         // without the string starts and ends.
1393         int32_t cpLength=spanOneBack(spanSet, s, pos);
1394         if(cpLength>0) {
1395             return pos;  // There is a set element at pos.
1396         }
1397 
1398         // Try to match the strings at pos.
1399         for(i=0; i<stringsLength; ++i) {
1400             // Use spanLengths rather than a spanBackLengths pointer because
1401             // it is easier and we only need to know whether the string is irrelevant
1402             // which is the same in either array.
1403             if(spanLengths[i]==ALL_CP_CONTAINED) {
1404                 continue;  // Irrelevant string.
1405             }
1406             const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1407             const UChar *s16=string.getBuffer();
1408             int32_t length16=string.length();
1409             if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1410                 return pos;  // There is a set element at pos.
1411             }
1412         }
1413 
1414         // The span(while not contained) ended on a string start/end which is
1415         // not in the original set. Skip this code point and continue.
1416         // cpLength<0
1417         pos+=cpLength;
1418     } while(pos!=0);
1419     return 0;  // Reached the start of the string.
1420 }
1421 
spanNotUTF8(const uint8_t * s,int32_t length) const1422 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1423     int32_t pos=0, rest=length;
1424     int32_t i, stringsLength=strings.size();
1425     uint8_t *spanUTF8Lengths=spanLengths;
1426     if(all) {
1427         spanUTF8Lengths+=2*stringsLength;
1428     }
1429     do {
1430         // Span until we find a code point from the set,
1431         // or a code point that starts or ends some string.
1432         i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1433         if(i==rest) {
1434             return length;  // Reached the end of the string.
1435         }
1436         pos+=i;
1437         rest-=i;
1438 
1439         // Check whether the current code point is in the original set,
1440         // without the string starts and ends.
1441         int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1442         if(cpLength>0) {
1443             return pos;  // There is a set element at pos.
1444         }
1445 
1446         // Try to match the strings at pos.
1447         const uint8_t *s8=utf8;
1448         int32_t length8;
1449         for(i=0; i<stringsLength; ++i) {
1450             length8=utf8Lengths[i];
1451             // ALL_CP_CONTAINED: Irrelevant string.
1452             if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1453                 return pos;  // There is a set element at pos.
1454             }
1455             s8+=length8;
1456         }
1457 
1458         // The span(while not contained) ended on a string start/end which is
1459         // not in the original set. Skip this code point and continue.
1460         // cpLength<0
1461         pos-=cpLength;
1462         rest+=cpLength;
1463     } while(rest!=0);
1464     return length;  // Reached the end of the string.
1465 }
1466 
spanNotBackUTF8(const uint8_t * s,int32_t length) const1467 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1468     int32_t pos=length;
1469     int32_t i, stringsLength=strings.size();
1470     uint8_t *spanBackUTF8Lengths=spanLengths;
1471     if(all) {
1472         spanBackUTF8Lengths+=3*stringsLength;
1473     }
1474     do {
1475         // Span until we find a code point from the set,
1476         // or a code point that starts or ends some string.
1477         pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1478         if(pos==0) {
1479             return 0;  // Reached the start of the string.
1480         }
1481 
1482         // Check whether the current code point is in the original set,
1483         // without the string starts and ends.
1484         int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1485         if(cpLength>0) {
1486             return pos;  // There is a set element at pos.
1487         }
1488 
1489         // Try to match the strings at pos.
1490         const uint8_t *s8=utf8;
1491         int32_t length8;
1492         for(i=0; i<stringsLength; ++i) {
1493             length8=utf8Lengths[i];
1494             // ALL_CP_CONTAINED: Irrelevant string.
1495             if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1496                 return pos;  // There is a set element at pos.
1497             }
1498             s8+=length8;
1499         }
1500 
1501         // The span(while not contained) ended on a string start/end which is
1502         // not in the original set. Skip this code point and continue.
1503         // cpLength<0
1504         pos+=cpLength;
1505     } while(pos!=0);
1506     return 0;  // Reached the start of the string.
1507 }
1508 
1509 U_NAMESPACE_END
1510