1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2007-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: unisetspan.cpp
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2007mar01
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf8.h"
23 #include "unicode/utf16.h"
24 #include "cmemory.h"
25 #include "uvector.h"
26 #include "unisetspan.h"
27
28 U_NAMESPACE_BEGIN
29
30 /*
31 * List of offsets from the current position from where to try matching
32 * a code point or a string.
33 * Store offsets rather than indexes to simplify the code and use the same list
34 * for both increments (in span()) and decrements (in spanBack()).
35 *
36 * Assumption: The maximum offset is limited, and the offsets that are stored
37 * at any one time are relatively dense, that is, there are normally no gaps of
38 * hundreds or thousands of offset values.
39 *
40 * The implementation uses a circular buffer of byte flags,
41 * each indicating whether the corresponding offset is in the list.
42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43 * physically moving part of the list.
44 *
45 * Note: In principle, the caller should setMaxLength() to the maximum of the
46 * max string length and U16_LENGTH/U8_LENGTH to account for
47 * "long" single code points.
48 * However, this implementation uses at least a staticList with more than
49 * U8_LENGTH entries anyway.
50 *
51 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52 * the list could be stored as bit flags in a single integer.
53 * Rather than handling a circular buffer with a start list index,
54 * the integer would simply be shifted when lower offsets are removed.
55 * UnicodeSet does not have a limit on the lengths of strings.
56 */
57 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58 public:
OffsetList()59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
~OffsetList()61 ~OffsetList() {
62 if(list!=staticList) {
63 uprv_free(list);
64 }
65 }
66
67 // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68 void setMaxLength(int32_t maxLength) {
69 if(maxLength<=(int32_t)sizeof(staticList)) {
70 capacity=(int32_t)sizeof(staticList);
71 } else {
72 UBool *l=(UBool *)uprv_malloc(maxLength);
73 if(l!=NULL) {
74 list=l;
75 capacity=maxLength;
76 }
77 }
78 uprv_memset(list, 0, capacity);
79 }
80
clear()81 void clear() {
82 uprv_memset(list, 0, capacity);
83 start=length=0;
84 }
85
isEmpty() const86 UBool isEmpty() const {
87 return (UBool)(length==0);
88 }
89
90 // Reduce all stored offsets by delta, used when the current position
91 // moves by delta.
92 // There must not be any offsets lower than delta.
93 // If there is an offset equal to delta, it is removed.
94 // delta=[1..maxLength]
shift(int32_t delta)95 void shift(int32_t delta) {
96 int32_t i=start+delta;
97 if(i>=capacity) {
98 i-=capacity;
99 }
100 if(list[i]) {
101 list[i]=FALSE;
102 --length;
103 }
104 start=i;
105 }
106
107 // Add an offset. The list must not contain it yet.
108 // offset=[1..maxLength]
addOffset(int32_t offset)109 void addOffset(int32_t offset) {
110 int32_t i=start+offset;
111 if(i>=capacity) {
112 i-=capacity;
113 }
114 list[i]=TRUE;
115 ++length;
116 }
117
118 // offset=[1..maxLength]
containsOffset(int32_t offset) const119 UBool containsOffset(int32_t offset) const {
120 int32_t i=start+offset;
121 if(i>=capacity) {
122 i-=capacity;
123 }
124 return list[i];
125 }
126
127 // Find the lowest stored offset from a non-empty list, remove it,
128 // and reduce all other offsets by this minimum.
129 // Returns [1..maxLength].
popMinimum()130 int32_t popMinimum() {
131 // Look for the next offset in list[start+1..capacity-1].
132 int32_t i=start, result;
133 while(++i<capacity) {
134 if(list[i]) {
135 list[i]=FALSE;
136 --length;
137 result=i-start;
138 start=i;
139 return result;
140 }
141 }
142 // i==capacity
143
144 // Wrap around and look for the next offset in list[0..start].
145 // Since the list is not empty, there will be one.
146 result=capacity-start;
147 i=0;
148 while(!list[i]) {
149 ++i;
150 }
151 list[i]=FALSE;
152 --length;
153 start=i;
154 return result+=i;
155 }
156
157 private:
158 UBool *list;
159 int32_t capacity;
160 int32_t length;
161 int32_t start;
162
163 UBool staticList[16];
164 };
165
166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167 static int32_t
getUTF8Length(const UChar * s,int32_t length)168 getUTF8Length(const UChar *s, int32_t length) {
169 UErrorCode errorCode=U_ZERO_ERROR;
170 int32_t length8=0;
171 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173 return length8;
174 } else {
175 // The string contains an unpaired surrogate.
176 // Ignore this string.
177 return 0;
178 }
179 }
180
181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184 UErrorCode errorCode=U_ZERO_ERROR;
185 int32_t length8=0;
186 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187 if(U_SUCCESS(errorCode)) {
188 return length8;
189 } else {
190 // The string contains an unpaired surrogate.
191 // Ignore this string.
192 return 0;
193 }
194 }
195
196 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197 makeSpanLengthByte(int32_t spanLength) {
198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 }
201
202 // Construct for all variants of span(), or only for any one variant.
203 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205 const UVector &setStrings,
206 uint32_t which)
207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209 utf8Length(0),
210 maxLength16(0), maxLength8(0),
211 all((UBool)(which==ALL)) {
212 spanSet.retainAll(set);
213 if(which&NOT_CONTAINED) {
214 // Default to the same sets.
215 // addToSpanNotSet() will create a separate set if necessary.
216 pSpanNotSet=&spanSet;
217 }
218
219 // Determine if the strings even need to be taken into account at all for span() etc.
220 // If any string is relevant, then all strings need to be used for
221 // span(longest match) but only the relevant ones for span(while contained).
222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226 int32_t stringsLength=strings.size();
227
228 int32_t i, spanLength;
229 UBool someRelevant=FALSE;
230 for(i=0; i<stringsLength; ++i) {
231 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232 const UChar *s16=string.getBuffer();
233 int32_t length16=string.length();
234 UBool thisRelevant;
235 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
236 if(spanLength<length16) { // Relevant string.
237 someRelevant=thisRelevant=TRUE;
238 } else {
239 thisRelevant=FALSE;
240 }
241 if((which&UTF16) && length16>maxLength16) {
242 maxLength16=length16;
243 }
244 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
245 int32_t length8=getUTF8Length(s16, length16);
246 utf8Length+=length8;
247 if(length8>maxLength8) {
248 maxLength8=length8;
249 }
250 }
251 }
252 if(!someRelevant) {
253 maxLength16=maxLength8=0;
254 return;
255 }
256
257 // Freeze after checking for the need to use strings at all because freezing
258 // a set takes some time and memory which are wasted if there are no relevant strings.
259 if(all) {
260 spanSet.freeze();
261 }
262
263 uint8_t *spanBackLengths;
264 uint8_t *spanUTF8Lengths;
265 uint8_t *spanBackUTF8Lengths;
266
267 // Allocate a block of meta data.
268 int32_t allocSize;
269 if(all) {
270 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
271 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
272 } else {
273 allocSize=stringsLength; // One set of span lengths.
274 if(which&UTF8) {
275 // UTF-8 lengths and UTF-8 strings.
276 allocSize+=stringsLength*4+utf8Length;
277 }
278 }
279 if(allocSize<=(int32_t)sizeof(staticLengths)) {
280 utf8Lengths=staticLengths;
281 } else {
282 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
283 if(utf8Lengths==NULL) {
284 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
285 return; // Out of memory.
286 }
287 }
288
289 if(all) {
290 // Store span lengths for all span() variants.
291 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
292 spanBackLengths=spanLengths+stringsLength;
293 spanUTF8Lengths=spanBackLengths+stringsLength;
294 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
295 utf8=spanBackUTF8Lengths+stringsLength;
296 } else {
297 // Store span lengths for only one span() variant.
298 if(which&UTF8) {
299 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
300 utf8=spanLengths+stringsLength;
301 } else {
302 spanLengths=(uint8_t *)utf8Lengths;
303 }
304 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
305 }
306
307 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
308 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
309
310 for(i=0; i<stringsLength; ++i) {
311 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
312 const UChar *s16=string.getBuffer();
313 int32_t length16=string.length();
314 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
315 if(spanLength<length16) { // Relevant string.
316 if(which&UTF16) {
317 if(which&CONTAINED) {
318 if(which&FWD) {
319 spanLengths[i]=makeSpanLengthByte(spanLength);
320 }
321 if(which&BACK) {
322 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
323 spanBackLengths[i]=makeSpanLengthByte(spanLength);
324 }
325 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
326 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
327 }
328 }
329 if(which&UTF8) {
330 uint8_t *s8=utf8+utf8Count;
331 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
332 utf8Count+=utf8Lengths[i]=length8;
333 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
334 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
335 } else { // Relevant for UTF-8.
336 if(which&CONTAINED) {
337 if(which&FWD) {
338 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340 }
341 if(which&BACK) {
342 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
343 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
344 }
345 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
346 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
347 }
348 }
349 }
350 if(which&NOT_CONTAINED) {
351 // Add string start and end code points to the spanNotSet so that
352 // a span(while not contained) stops before any string.
353 UChar32 c;
354 if(which&FWD) {
355 int32_t len=0;
356 U16_NEXT(s16, len, length16, c);
357 addToSpanNotSet(c);
358 }
359 if(which&BACK) {
360 int32_t len=length16;
361 U16_PREV(s16, 0, len, c);
362 addToSpanNotSet(c);
363 }
364 }
365 } else { // Irrelevant string.
366 if(which&UTF8) {
367 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
368 uint8_t *s8=utf8+utf8Count;
369 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
370 utf8Count+=utf8Lengths[i]=length8;
371 } else {
372 utf8Lengths[i]=0;
373 }
374 }
375 if(all) {
376 spanLengths[i]=spanBackLengths[i]=
377 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
378 (uint8_t)ALL_CP_CONTAINED;
379 } else {
380 // All spanXYZLengths pointers contain the same address.
381 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
382 }
383 }
384 }
385
386 // Finish.
387 if(all) {
388 pSpanNotSet->freeze();
389 }
390 }
391
392 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)393 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
394 const UVector &newParentSetStrings)
395 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
396 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
397 utf8Length(otherStringSpan.utf8Length),
398 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
399 all(TRUE) {
400 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
401 pSpanNotSet=&spanSet;
402 } else {
403 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
404 }
405
406 // Allocate a block of meta data.
407 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
408 int32_t stringsLength=strings.size();
409 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
410 if(allocSize<=(int32_t)sizeof(staticLengths)) {
411 utf8Lengths=staticLengths;
412 } else {
413 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
414 if(utf8Lengths==NULL) {
415 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
416 return; // Out of memory.
417 }
418 }
419
420 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
421 utf8=spanLengths+stringsLength*4;
422 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
423 }
424
~UnicodeSetStringSpan()425 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
426 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
427 delete pSpanNotSet;
428 }
429 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
430 uprv_free(utf8Lengths);
431 }
432 }
433
addToSpanNotSet(UChar32 c)434 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
435 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
436 if(spanSet.contains(c)) {
437 return; // Nothing to do.
438 }
439 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
440 if(newSet==NULL) {
441 return; // Out of memory.
442 } else {
443 pSpanNotSet=newSet;
444 }
445 }
446 pSpanNotSet->add(c);
447 }
448
449 // Compare strings without any argument checks. Requires length>0.
450 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)451 matches16(const UChar *s, const UChar *t, int32_t length) {
452 do {
453 if(*s++!=*t++) {
454 return FALSE;
455 }
456 } while(--length>0);
457 return TRUE;
458 }
459
460 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)461 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
462 do {
463 if(*s++!=*t++) {
464 return FALSE;
465 }
466 } while(--length>0);
467 return TRUE;
468 }
469
470 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
471 // at code point boundaries.
472 // That is, each edge of a match must not be in the middle of a surrogate pair.
473 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)474 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
475 s+=start;
476 limit-=start;
477 return matches16(s, t, length) &&
478 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
479 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
480 }
481
482 // Does the set contain the next code point?
483 // If so, return its length; otherwise return its negative length.
484 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)485 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
486 UChar c=*s, c2;
487 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
488 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
489 }
490 return set.contains(c) ? 1 : -1;
491 }
492
493 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)494 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
495 UChar c=s[length-1], c2;
496 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
497 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
498 }
499 return set.contains(c) ? 1 : -1;
500 }
501
502 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)503 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
504 UChar32 c=*s;
505 if((int8_t)c>=0) {
506 return set.contains(c) ? 1 : -1;
507 }
508 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
509 int32_t i=0;
510 U8_NEXT_OR_FFFD(s, i, length, c);
511 return set.contains(c) ? i : -i;
512 }
513
514 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)515 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
516 UChar32 c=s[length-1];
517 if((int8_t)c>=0) {
518 return set.contains(c) ? 1 : -1;
519 }
520 int32_t i=length-1;
521 c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
522 length-=i;
523 return set.contains(c) ? length : -length;
524 }
525
526 /*
527 * Note: In span() when spanLength==0 (after a string match, or at the beginning
528 * after an empty code point span) and in spanNot() and spanNotUTF8(),
529 * string matching could use a binary search
530 * because all string matches are done from the same start index.
531 *
532 * For UTF-8, this would require a comparison function that returns UTF-16 order.
533 *
534 * This optimization should not be necessary for normal UnicodeSets because
535 * most sets have no strings, and most sets with strings have
536 * very few very short strings.
537 * For cases with many strings, it might be better to use a different API
538 * and implementation with a DFA (state machine).
539 */
540
541 /*
542 * Algorithm for span(USET_SPAN_CONTAINED)
543 *
544 * Theoretical algorithm:
545 * - Iterate through the string, and at each code point boundary:
546 * + If the code point there is in the set, then remember to continue after it.
547 * + If a set string matches at the current position, then remember to continue after it.
548 * + Either recursively span for each code point or string match,
549 * or recursively span for all but the shortest one and
550 * iteratively continue the span with the shortest local match.
551 * + Remember the longest recursive span (the farthest end point).
552 * + If there is no match at the current position, neither for the code point there
553 * nor for any set string, then stop and return the longest recursive span length.
554 *
555 * Optimized implementation:
556 *
557 * (We assume that most sets will have very few very short strings.
558 * A span using a string-less set is extremely fast.)
559 *
560 * Create and cache a spanSet which contains all of the single code points
561 * of the original set but none of its strings.
562 *
563 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
564 * - Loop:
565 * + Try to match each set string at the end of the spanLength.
566 * ~ Set strings that start with set-contained code points must be matched
567 * with a partial overlap because the recursive algorithm would have tried
568 * to match them at every position.
569 * ~ Set strings that entirely consist of set-contained code points
570 * are irrelevant for span(USET_SPAN_CONTAINED) because the
571 * recursive algorithm would continue after them anyway
572 * and find the longest recursive match from their end.
573 * ~ Rather than recursing, note each end point of a set string match.
574 * + If no set string matched after spanSet.span(), then return
575 * with where the spanSet.span() ended.
576 * + If at least one set string matched after spanSet.span(), then
577 * pop the shortest string match end point and continue
578 * the loop, trying to match all set strings from there.
579 * + If at least one more set string matched after a previous string match,
580 * then test if the code point after the previous string match is also
581 * contained in the set.
582 * Continue the loop with the shortest end point of either this code point
583 * or a matching set string.
584 * + If no more set string matched after a previous string match,
585 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
586 * Stop if spanLength==0, otherwise continue the loop.
587 *
588 * By noting each end point of a set string match,
589 * the function visits each string position at most once and finishes
590 * in linear time.
591 *
592 * The recursive algorithm may visit the same string position many times
593 * if multiple paths lead to it and finishes in exponential time.
594 */
595
596 /*
597 * Algorithm for span(USET_SPAN_SIMPLE)
598 *
599 * Theoretical algorithm:
600 * - Iterate through the string, and at each code point boundary:
601 * + If the code point there is in the set, then remember to continue after it.
602 * + If a set string matches at the current position, then remember to continue after it.
603 * + Continue from the farthest match position and ignore all others.
604 * + If there is no match at the current position,
605 * then stop and return the current position.
606 *
607 * Optimized implementation:
608 *
609 * (Same assumption and spanSet as above.)
610 *
611 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
612 * - Loop:
613 * + Try to match each set string at the end of the spanLength.
614 * ~ Set strings that start with set-contained code points must be matched
615 * with a partial overlap because the standard algorithm would have tried
616 * to match them earlier.
617 * ~ Set strings that entirely consist of set-contained code points
618 * must be matched with a full overlap because the longest-match algorithm
619 * would hide set string matches that end earlier.
620 * Such set strings need not be matched earlier inside the code point span
621 * because the standard algorithm would then have continued after
622 * the set string match anyway.
623 * ~ Remember the longest set string match (farthest end point) from the earliest
624 * starting point.
625 * + If no set string matched after spanSet.span(), then return
626 * with where the spanSet.span() ended.
627 * + If at least one set string matched, then continue the loop after the
628 * longest match from the earliest position.
629 * + If no more set string matched after a previous string match,
630 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
631 * Stop if spanLength==0, otherwise continue the loop.
632 */
633
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const634 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
635 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
636 return spanNot(s, length);
637 }
638 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
639 if(spanLength==length) {
640 return length;
641 }
642
643 // Consider strings; they may overlap with the span.
644 OffsetList offsets;
645 if(spanCondition==USET_SPAN_CONTAINED) {
646 // Use offset list to try all possibilities.
647 offsets.setMaxLength(maxLength16);
648 }
649 int32_t pos=spanLength, rest=length-pos;
650 int32_t i, stringsLength=strings.size();
651 for(;;) {
652 if(spanCondition==USET_SPAN_CONTAINED) {
653 for(i=0; i<stringsLength; ++i) {
654 int32_t overlap=spanLengths[i];
655 if(overlap==ALL_CP_CONTAINED) {
656 continue; // Irrelevant string.
657 }
658 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
659 const UChar *s16=string.getBuffer();
660 int32_t length16=string.length();
661
662 // Try to match this string at pos-overlap..pos.
663 if(overlap>=LONG_SPAN) {
664 overlap=length16;
665 // While contained: No point matching fully inside the code point span.
666 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
667 }
668 if(overlap>spanLength) {
669 overlap=spanLength;
670 }
671 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
672 for(;;) {
673 if(inc>rest) {
674 break;
675 }
676 // Try to match if the increment is not listed already.
677 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
678 if(inc==rest) {
679 return length; // Reached the end of the string.
680 }
681 offsets.addOffset(inc);
682 }
683 if(overlap==0) {
684 break;
685 }
686 --overlap;
687 ++inc;
688 }
689 }
690 } else /* USET_SPAN_SIMPLE */ {
691 int32_t maxInc=0, maxOverlap=0;
692 for(i=0; i<stringsLength; ++i) {
693 int32_t overlap=spanLengths[i];
694 // For longest match, we do need to try to match even an all-contained string
695 // to find the match from the earliest start.
696
697 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
698 const UChar *s16=string.getBuffer();
699 int32_t length16=string.length();
700
701 // Try to match this string at pos-overlap..pos.
702 if(overlap>=LONG_SPAN) {
703 overlap=length16;
704 // Longest match: Need to match fully inside the code point span
705 // to find the match from the earliest start.
706 }
707 if(overlap>spanLength) {
708 overlap=spanLength;
709 }
710 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
711 for(;;) {
712 if(inc>rest || overlap<maxOverlap) {
713 break;
714 }
715 // Try to match if the string is longer or starts earlier.
716 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
717 matches16CPB(s, pos-overlap, length, s16, length16)
718 ) {
719 maxInc=inc; // Longest match from earliest start.
720 maxOverlap=overlap;
721 break;
722 }
723 --overlap;
724 ++inc;
725 }
726 }
727
728 if(maxInc!=0 || maxOverlap!=0) {
729 // Longest-match algorithm, and there was a string match.
730 // Simply continue after it.
731 pos+=maxInc;
732 rest-=maxInc;
733 if(rest==0) {
734 return length; // Reached the end of the string.
735 }
736 spanLength=0; // Match strings from after a string match.
737 continue;
738 }
739 }
740 // Finished trying to match all strings at pos.
741
742 if(spanLength!=0 || pos==0) {
743 // The position is after an unlimited code point span (spanLength!=0),
744 // not after a string match.
745 // The only position where spanLength==0 after a span is pos==0.
746 // Otherwise, an unlimited code point span is only tried again when no
747 // strings match, and if such a non-initial span fails we stop.
748 if(offsets.isEmpty()) {
749 return pos; // No strings matched after a span.
750 }
751 // Match strings from after the next string match.
752 } else {
753 // The position is after a string match (or a single code point).
754 if(offsets.isEmpty()) {
755 // No more strings matched after a previous string match.
756 // Try another code point span from after the last string match.
757 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
758 if( spanLength==rest || // Reached the end of the string, or
759 spanLength==0 // neither strings nor span progressed.
760 ) {
761 return pos+spanLength;
762 }
763 pos+=spanLength;
764 rest-=spanLength;
765 continue; // spanLength>0: Match strings from after a span.
766 } else {
767 // Try to match only one code point from after a string match if some
768 // string matched beyond it, so that we try all possible positions
769 // and don't overshoot.
770 spanLength=spanOne(spanSet, s+pos, rest);
771 if(spanLength>0) {
772 if(spanLength==rest) {
773 return length; // Reached the end of the string.
774 }
775 // Match strings after this code point.
776 // There cannot be any increments below it because UnicodeSet strings
777 // contain multiple code points.
778 pos+=spanLength;
779 rest-=spanLength;
780 offsets.shift(spanLength);
781 spanLength=0;
782 continue; // Match strings from after a single code point.
783 }
784 // Match strings from after the next string match.
785 }
786 }
787 int32_t minOffset=offsets.popMinimum();
788 pos+=minOffset;
789 rest-=minOffset;
790 spanLength=0; // Match strings from after a string match.
791 }
792 }
793
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const794 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
795 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
796 return spanNotBack(s, length);
797 }
798 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
799 if(pos==0) {
800 return 0;
801 }
802 int32_t spanLength=length-pos;
803
804 // Consider strings; they may overlap with the span.
805 OffsetList offsets;
806 if(spanCondition==USET_SPAN_CONTAINED) {
807 // Use offset list to try all possibilities.
808 offsets.setMaxLength(maxLength16);
809 }
810 int32_t i, stringsLength=strings.size();
811 uint8_t *spanBackLengths=spanLengths;
812 if(all) {
813 spanBackLengths+=stringsLength;
814 }
815 for(;;) {
816 if(spanCondition==USET_SPAN_CONTAINED) {
817 for(i=0; i<stringsLength; ++i) {
818 int32_t overlap=spanBackLengths[i];
819 if(overlap==ALL_CP_CONTAINED) {
820 continue; // Irrelevant string.
821 }
822 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
823 const UChar *s16=string.getBuffer();
824 int32_t length16=string.length();
825
826 // Try to match this string at pos-(length16-overlap)..pos-length16.
827 if(overlap>=LONG_SPAN) {
828 overlap=length16;
829 // While contained: No point matching fully inside the code point span.
830 int32_t len1=0;
831 U16_FWD_1(s16, len1, overlap);
832 overlap-=len1; // Length of the string minus the first code point.
833 }
834 if(overlap>spanLength) {
835 overlap=spanLength;
836 }
837 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
838 for(;;) {
839 if(dec>pos) {
840 break;
841 }
842 // Try to match if the decrement is not listed already.
843 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
844 if(dec==pos) {
845 return 0; // Reached the start of the string.
846 }
847 offsets.addOffset(dec);
848 }
849 if(overlap==0) {
850 break;
851 }
852 --overlap;
853 ++dec;
854 }
855 }
856 } else /* USET_SPAN_SIMPLE */ {
857 int32_t maxDec=0, maxOverlap=0;
858 for(i=0; i<stringsLength; ++i) {
859 int32_t overlap=spanBackLengths[i];
860 // For longest match, we do need to try to match even an all-contained string
861 // to find the match from the latest end.
862
863 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
864 const UChar *s16=string.getBuffer();
865 int32_t length16=string.length();
866
867 // Try to match this string at pos-(length16-overlap)..pos-length16.
868 if(overlap>=LONG_SPAN) {
869 overlap=length16;
870 // Longest match: Need to match fully inside the code point span
871 // to find the match from the latest end.
872 }
873 if(overlap>spanLength) {
874 overlap=spanLength;
875 }
876 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
877 for(;;) {
878 if(dec>pos || overlap<maxOverlap) {
879 break;
880 }
881 // Try to match if the string is longer or ends later.
882 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
883 matches16CPB(s, pos-dec, length, s16, length16)
884 ) {
885 maxDec=dec; // Longest match from latest end.
886 maxOverlap=overlap;
887 break;
888 }
889 --overlap;
890 ++dec;
891 }
892 }
893
894 if(maxDec!=0 || maxOverlap!=0) {
895 // Longest-match algorithm, and there was a string match.
896 // Simply continue before it.
897 pos-=maxDec;
898 if(pos==0) {
899 return 0; // Reached the start of the string.
900 }
901 spanLength=0; // Match strings from before a string match.
902 continue;
903 }
904 }
905 // Finished trying to match all strings at pos.
906
907 if(spanLength!=0 || pos==length) {
908 // The position is before an unlimited code point span (spanLength!=0),
909 // not before a string match.
910 // The only position where spanLength==0 before a span is pos==length.
911 // Otherwise, an unlimited code point span is only tried again when no
912 // strings match, and if such a non-initial span fails we stop.
913 if(offsets.isEmpty()) {
914 return pos; // No strings matched before a span.
915 }
916 // Match strings from before the next string match.
917 } else {
918 // The position is before a string match (or a single code point).
919 if(offsets.isEmpty()) {
920 // No more strings matched before a previous string match.
921 // Try another code point span from before the last string match.
922 int32_t oldPos=pos;
923 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
924 spanLength=oldPos-pos;
925 if( pos==0 || // Reached the start of the string, or
926 spanLength==0 // neither strings nor span progressed.
927 ) {
928 return pos;
929 }
930 continue; // spanLength>0: Match strings from before a span.
931 } else {
932 // Try to match only one code point from before a string match if some
933 // string matched beyond it, so that we try all possible positions
934 // and don't overshoot.
935 spanLength=spanOneBack(spanSet, s, pos);
936 if(spanLength>0) {
937 if(spanLength==pos) {
938 return 0; // Reached the start of the string.
939 }
940 // Match strings before this code point.
941 // There cannot be any decrements below it because UnicodeSet strings
942 // contain multiple code points.
943 pos-=spanLength;
944 offsets.shift(spanLength);
945 spanLength=0;
946 continue; // Match strings from before a single code point.
947 }
948 // Match strings from before the next string match.
949 }
950 }
951 pos-=offsets.popMinimum();
952 spanLength=0; // Match strings from before a string match.
953 }
954 }
955
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const956 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
957 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
958 return spanNotUTF8(s, length);
959 }
960 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
961 if(spanLength==length) {
962 return length;
963 }
964
965 // Consider strings; they may overlap with the span.
966 OffsetList offsets;
967 if(spanCondition==USET_SPAN_CONTAINED) {
968 // Use offset list to try all possibilities.
969 offsets.setMaxLength(maxLength8);
970 }
971 int32_t pos=spanLength, rest=length-pos;
972 int32_t i, stringsLength=strings.size();
973 uint8_t *spanUTF8Lengths=spanLengths;
974 if(all) {
975 spanUTF8Lengths+=2*stringsLength;
976 }
977 for(;;) {
978 const uint8_t *s8=utf8;
979 int32_t length8;
980 if(spanCondition==USET_SPAN_CONTAINED) {
981 for(i=0; i<stringsLength; ++i) {
982 length8=utf8Lengths[i];
983 if(length8==0) {
984 continue; // String not representable in UTF-8.
985 }
986 int32_t overlap=spanUTF8Lengths[i];
987 if(overlap==ALL_CP_CONTAINED) {
988 s8+=length8;
989 continue; // Irrelevant string.
990 }
991
992 // Try to match this string at pos-overlap..pos.
993 if(overlap>=LONG_SPAN) {
994 overlap=length8;
995 // While contained: No point matching fully inside the code point span.
996 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
997 }
998 if(overlap>spanLength) {
999 overlap=spanLength;
1000 }
1001 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1002 for(;;) {
1003 if(inc>rest) {
1004 break;
1005 }
1006 // Try to match if the increment is not listed already.
1007 // Match at code point boundaries. (The UTF-8 strings were converted
1008 // from UTF-16 and are guaranteed to be well-formed.)
1009 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1010 !offsets.containsOffset(inc) &&
1011 matches8(s+pos-overlap, s8, length8)
1012
1013 ) {
1014 if(inc==rest) {
1015 return length; // Reached the end of the string.
1016 }
1017 offsets.addOffset(inc);
1018 }
1019 if(overlap==0) {
1020 break;
1021 }
1022 --overlap;
1023 ++inc;
1024 }
1025 s8+=length8;
1026 }
1027 } else /* USET_SPAN_SIMPLE */ {
1028 int32_t maxInc=0, maxOverlap=0;
1029 for(i=0; i<stringsLength; ++i) {
1030 length8=utf8Lengths[i];
1031 if(length8==0) {
1032 continue; // String not representable in UTF-8.
1033 }
1034 int32_t overlap=spanUTF8Lengths[i];
1035 // For longest match, we do need to try to match even an all-contained string
1036 // to find the match from the earliest start.
1037
1038 // Try to match this string at pos-overlap..pos.
1039 if(overlap>=LONG_SPAN) {
1040 overlap=length8;
1041 // Longest match: Need to match fully inside the code point span
1042 // to find the match from the earliest start.
1043 }
1044 if(overlap>spanLength) {
1045 overlap=spanLength;
1046 }
1047 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1048 for(;;) {
1049 if(inc>rest || overlap<maxOverlap) {
1050 break;
1051 }
1052 // Try to match if the string is longer or starts earlier.
1053 // Match at code point boundaries. (The UTF-8 strings were converted
1054 // from UTF-16 and are guaranteed to be well-formed.)
1055 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1056 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1057 matches8(s+pos-overlap, s8, length8)
1058
1059 ) {
1060 maxInc=inc; // Longest match from earliest start.
1061 maxOverlap=overlap;
1062 break;
1063 }
1064 --overlap;
1065 ++inc;
1066 }
1067 s8+=length8;
1068 }
1069
1070 if(maxInc!=0 || maxOverlap!=0) {
1071 // Longest-match algorithm, and there was a string match.
1072 // Simply continue after it.
1073 pos+=maxInc;
1074 rest-=maxInc;
1075 if(rest==0) {
1076 return length; // Reached the end of the string.
1077 }
1078 spanLength=0; // Match strings from after a string match.
1079 continue;
1080 }
1081 }
1082 // Finished trying to match all strings at pos.
1083
1084 if(spanLength!=0 || pos==0) {
1085 // The position is after an unlimited code point span (spanLength!=0),
1086 // not after a string match.
1087 // The only position where spanLength==0 after a span is pos==0.
1088 // Otherwise, an unlimited code point span is only tried again when no
1089 // strings match, and if such a non-initial span fails we stop.
1090 if(offsets.isEmpty()) {
1091 return pos; // No strings matched after a span.
1092 }
1093 // Match strings from after the next string match.
1094 } else {
1095 // The position is after a string match (or a single code point).
1096 if(offsets.isEmpty()) {
1097 // No more strings matched after a previous string match.
1098 // Try another code point span from after the last string match.
1099 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1100 if( spanLength==rest || // Reached the end of the string, or
1101 spanLength==0 // neither strings nor span progressed.
1102 ) {
1103 return pos+spanLength;
1104 }
1105 pos+=spanLength;
1106 rest-=spanLength;
1107 continue; // spanLength>0: Match strings from after a span.
1108 } else {
1109 // Try to match only one code point from after a string match if some
1110 // string matched beyond it, so that we try all possible positions
1111 // and don't overshoot.
1112 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1113 if(spanLength>0) {
1114 if(spanLength==rest) {
1115 return length; // Reached the end of the string.
1116 }
1117 // Match strings after this code point.
1118 // There cannot be any increments below it because UnicodeSet strings
1119 // contain multiple code points.
1120 pos+=spanLength;
1121 rest-=spanLength;
1122 offsets.shift(spanLength);
1123 spanLength=0;
1124 continue; // Match strings from after a single code point.
1125 }
1126 // Match strings from after the next string match.
1127 }
1128 }
1129 int32_t minOffset=offsets.popMinimum();
1130 pos+=minOffset;
1131 rest-=minOffset;
1132 spanLength=0; // Match strings from after a string match.
1133 }
1134 }
1135
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1136 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1137 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1138 return spanNotBackUTF8(s, length);
1139 }
1140 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1141 if(pos==0) {
1142 return 0;
1143 }
1144 int32_t spanLength=length-pos;
1145
1146 // Consider strings; they may overlap with the span.
1147 OffsetList offsets;
1148 if(spanCondition==USET_SPAN_CONTAINED) {
1149 // Use offset list to try all possibilities.
1150 offsets.setMaxLength(maxLength8);
1151 }
1152 int32_t i, stringsLength=strings.size();
1153 uint8_t *spanBackUTF8Lengths=spanLengths;
1154 if(all) {
1155 spanBackUTF8Lengths+=3*stringsLength;
1156 }
1157 for(;;) {
1158 const uint8_t *s8=utf8;
1159 int32_t length8;
1160 if(spanCondition==USET_SPAN_CONTAINED) {
1161 for(i=0; i<stringsLength; ++i) {
1162 length8=utf8Lengths[i];
1163 if(length8==0) {
1164 continue; // String not representable in UTF-8.
1165 }
1166 int32_t overlap=spanBackUTF8Lengths[i];
1167 if(overlap==ALL_CP_CONTAINED) {
1168 s8+=length8;
1169 continue; // Irrelevant string.
1170 }
1171
1172 // Try to match this string at pos-(length8-overlap)..pos-length8.
1173 if(overlap>=LONG_SPAN) {
1174 overlap=length8;
1175 // While contained: No point matching fully inside the code point span.
1176 int32_t len1=0;
1177 U8_FWD_1(s8, len1, overlap);
1178 overlap-=len1; // Length of the string minus the first code point.
1179 }
1180 if(overlap>spanLength) {
1181 overlap=spanLength;
1182 }
1183 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1184 for(;;) {
1185 if(dec>pos) {
1186 break;
1187 }
1188 // Try to match if the decrement is not listed already.
1189 // Match at code point boundaries. (The UTF-8 strings were converted
1190 // from UTF-16 and are guaranteed to be well-formed.)
1191 if( !U8_IS_TRAIL(s[pos-dec]) &&
1192 !offsets.containsOffset(dec) &&
1193 matches8(s+pos-dec, s8, length8)
1194 ) {
1195 if(dec==pos) {
1196 return 0; // Reached the start of the string.
1197 }
1198 offsets.addOffset(dec);
1199 }
1200 if(overlap==0) {
1201 break;
1202 }
1203 --overlap;
1204 ++dec;
1205 }
1206 s8+=length8;
1207 }
1208 } else /* USET_SPAN_SIMPLE */ {
1209 int32_t maxDec=0, maxOverlap=0;
1210 for(i=0; i<stringsLength; ++i) {
1211 length8=utf8Lengths[i];
1212 if(length8==0) {
1213 continue; // String not representable in UTF-8.
1214 }
1215 int32_t overlap=spanBackUTF8Lengths[i];
1216 // For longest match, we do need to try to match even an all-contained string
1217 // to find the match from the latest end.
1218
1219 // Try to match this string at pos-(length8-overlap)..pos-length8.
1220 if(overlap>=LONG_SPAN) {
1221 overlap=length8;
1222 // Longest match: Need to match fully inside the code point span
1223 // to find the match from the latest end.
1224 }
1225 if(overlap>spanLength) {
1226 overlap=spanLength;
1227 }
1228 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1229 for(;;) {
1230 if(dec>pos || overlap<maxOverlap) {
1231 break;
1232 }
1233 // Try to match if the string is longer or ends later.
1234 // Match at code point boundaries. (The UTF-8 strings were converted
1235 // from UTF-16 and are guaranteed to be well-formed.)
1236 if( !U8_IS_TRAIL(s[pos-dec]) &&
1237 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1238 matches8(s+pos-dec, s8, length8)
1239 ) {
1240 maxDec=dec; // Longest match from latest end.
1241 maxOverlap=overlap;
1242 break;
1243 }
1244 --overlap;
1245 ++dec;
1246 }
1247 s8+=length8;
1248 }
1249
1250 if(maxDec!=0 || maxOverlap!=0) {
1251 // Longest-match algorithm, and there was a string match.
1252 // Simply continue before it.
1253 pos-=maxDec;
1254 if(pos==0) {
1255 return 0; // Reached the start of the string.
1256 }
1257 spanLength=0; // Match strings from before a string match.
1258 continue;
1259 }
1260 }
1261 // Finished trying to match all strings at pos.
1262
1263 if(spanLength!=0 || pos==length) {
1264 // The position is before an unlimited code point span (spanLength!=0),
1265 // not before a string match.
1266 // The only position where spanLength==0 before a span is pos==length.
1267 // Otherwise, an unlimited code point span is only tried again when no
1268 // strings match, and if such a non-initial span fails we stop.
1269 if(offsets.isEmpty()) {
1270 return pos; // No strings matched before a span.
1271 }
1272 // Match strings from before the next string match.
1273 } else {
1274 // The position is before a string match (or a single code point).
1275 if(offsets.isEmpty()) {
1276 // No more strings matched before a previous string match.
1277 // Try another code point span from before the last string match.
1278 int32_t oldPos=pos;
1279 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1280 spanLength=oldPos-pos;
1281 if( pos==0 || // Reached the start of the string, or
1282 spanLength==0 // neither strings nor span progressed.
1283 ) {
1284 return pos;
1285 }
1286 continue; // spanLength>0: Match strings from before a span.
1287 } else {
1288 // Try to match only one code point from before a string match if some
1289 // string matched beyond it, so that we try all possible positions
1290 // and don't overshoot.
1291 spanLength=spanOneBackUTF8(spanSet, s, pos);
1292 if(spanLength>0) {
1293 if(spanLength==pos) {
1294 return 0; // Reached the start of the string.
1295 }
1296 // Match strings before this code point.
1297 // There cannot be any decrements below it because UnicodeSet strings
1298 // contain multiple code points.
1299 pos-=spanLength;
1300 offsets.shift(spanLength);
1301 spanLength=0;
1302 continue; // Match strings from before a single code point.
1303 }
1304 // Match strings from before the next string match.
1305 }
1306 }
1307 pos-=offsets.popMinimum();
1308 spanLength=0; // Match strings from before a string match.
1309 }
1310 }
1311
1312 /*
1313 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1314 *
1315 * Theoretical algorithm:
1316 * - Iterate through the string, and at each code point boundary:
1317 * + If the code point there is in the set, then return with the current position.
1318 * + If a set string matches at the current position, then return with the current position.
1319 *
1320 * Optimized implementation:
1321 *
1322 * (Same assumption as for span() above.)
1323 *
1324 * Create and cache a spanNotSet which contains all of the single code points
1325 * of the original set but none of its strings.
1326 * For each set string add its initial code point to the spanNotSet.
1327 * (Also add its final code point for spanNotBack().)
1328 *
1329 * - Loop:
1330 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1331 * + If the current code point is in the original set, then
1332 * return the current position.
1333 * + If any set string matches at the current position, then
1334 * return the current position.
1335 * + If there is no match at the current position, neither for the code point there
1336 * nor for any set string, then skip this code point and continue the loop.
1337 * This happens for set-string-initial code points that were added to spanNotSet
1338 * when there is not actually a match for such a set string.
1339 */
1340
spanNot(const UChar * s,int32_t length) const1341 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1342 int32_t pos=0, rest=length;
1343 int32_t i, stringsLength=strings.size();
1344 do {
1345 // Span until we find a code point from the set,
1346 // or a code point that starts or ends some string.
1347 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1348 if(i==rest) {
1349 return length; // Reached the end of the string.
1350 }
1351 pos+=i;
1352 rest-=i;
1353
1354 // Check whether the current code point is in the original set,
1355 // without the string starts and ends.
1356 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1357 if(cpLength>0) {
1358 return pos; // There is a set element at pos.
1359 }
1360
1361 // Try to match the strings at pos.
1362 for(i=0; i<stringsLength; ++i) {
1363 if(spanLengths[i]==ALL_CP_CONTAINED) {
1364 continue; // Irrelevant string.
1365 }
1366 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1367 const UChar *s16=string.getBuffer();
1368 int32_t length16=string.length();
1369 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1370 return pos; // There is a set element at pos.
1371 }
1372 }
1373
1374 // The span(while not contained) ended on a string start/end which is
1375 // not in the original set. Skip this code point and continue.
1376 // cpLength<0
1377 pos-=cpLength;
1378 rest+=cpLength;
1379 } while(rest!=0);
1380 return length; // Reached the end of the string.
1381 }
1382
spanNotBack(const UChar * s,int32_t length) const1383 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1384 int32_t pos=length;
1385 int32_t i, stringsLength=strings.size();
1386 do {
1387 // Span until we find a code point from the set,
1388 // or a code point that starts or ends some string.
1389 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1390 if(pos==0) {
1391 return 0; // Reached the start of the string.
1392 }
1393
1394 // Check whether the current code point is in the original set,
1395 // without the string starts and ends.
1396 int32_t cpLength=spanOneBack(spanSet, s, pos);
1397 if(cpLength>0) {
1398 return pos; // There is a set element at pos.
1399 }
1400
1401 // Try to match the strings at pos.
1402 for(i=0; i<stringsLength; ++i) {
1403 // Use spanLengths rather than a spanBackLengths pointer because
1404 // it is easier and we only need to know whether the string is irrelevant
1405 // which is the same in either array.
1406 if(spanLengths[i]==ALL_CP_CONTAINED) {
1407 continue; // Irrelevant string.
1408 }
1409 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1410 const UChar *s16=string.getBuffer();
1411 int32_t length16=string.length();
1412 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1413 return pos; // There is a set element at pos.
1414 }
1415 }
1416
1417 // The span(while not contained) ended on a string start/end which is
1418 // not in the original set. Skip this code point and continue.
1419 // cpLength<0
1420 pos+=cpLength;
1421 } while(pos!=0);
1422 return 0; // Reached the start of the string.
1423 }
1424
spanNotUTF8(const uint8_t * s,int32_t length) const1425 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1426 int32_t pos=0, rest=length;
1427 int32_t i, stringsLength=strings.size();
1428 uint8_t *spanUTF8Lengths=spanLengths;
1429 if(all) {
1430 spanUTF8Lengths+=2*stringsLength;
1431 }
1432 do {
1433 // Span until we find a code point from the set,
1434 // or a code point that starts or ends some string.
1435 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1436 if(i==rest) {
1437 return length; // Reached the end of the string.
1438 }
1439 pos+=i;
1440 rest-=i;
1441
1442 // Check whether the current code point is in the original set,
1443 // without the string starts and ends.
1444 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1445 if(cpLength>0) {
1446 return pos; // There is a set element at pos.
1447 }
1448
1449 // Try to match the strings at pos.
1450 const uint8_t *s8=utf8;
1451 int32_t length8;
1452 for(i=0; i<stringsLength; ++i) {
1453 length8=utf8Lengths[i];
1454 // ALL_CP_CONTAINED: Irrelevant string.
1455 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1456 return pos; // There is a set element at pos.
1457 }
1458 s8+=length8;
1459 }
1460
1461 // The span(while not contained) ended on a string start/end which is
1462 // not in the original set. Skip this code point and continue.
1463 // cpLength<0
1464 pos-=cpLength;
1465 rest+=cpLength;
1466 } while(rest!=0);
1467 return length; // Reached the end of the string.
1468 }
1469
spanNotBackUTF8(const uint8_t * s,int32_t length) const1470 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1471 int32_t pos=length;
1472 int32_t i, stringsLength=strings.size();
1473 uint8_t *spanBackUTF8Lengths=spanLengths;
1474 if(all) {
1475 spanBackUTF8Lengths+=3*stringsLength;
1476 }
1477 do {
1478 // Span until we find a code point from the set,
1479 // or a code point that starts or ends some string.
1480 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1481 if(pos==0) {
1482 return 0; // Reached the start of the string.
1483 }
1484
1485 // Check whether the current code point is in the original set,
1486 // without the string starts and ends.
1487 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1488 if(cpLength>0) {
1489 return pos; // There is a set element at pos.
1490 }
1491
1492 // Try to match the strings at pos.
1493 const uint8_t *s8=utf8;
1494 int32_t length8;
1495 for(i=0; i<stringsLength; ++i) {
1496 length8=utf8Lengths[i];
1497 // ALL_CP_CONTAINED: Irrelevant string.
1498 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1499 return pos; // There is a set element at pos.
1500 }
1501 s8+=length8;
1502 }
1503
1504 // The span(while not contained) ended on a string start/end which is
1505 // not in the original set. Skip this code point and continue.
1506 // cpLength<0
1507 pos+=cpLength;
1508 } while(pos!=0);
1509 return 0; // Reached the start of the string.
1510 }
1511
1512 U_NAMESPACE_END
1513