1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 * Copyright (C) 2007-2012, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 ******************************************************************************
10 * file name: unisetspan.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2007mar01
16 * created by: Markus W. Scherer
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/ustring.h"
22 #include "unicode/utf8.h"
23 #include "unicode/utf16.h"
24 #include "cmemory.h"
25 #include "uvector.h"
26 #include "unisetspan.h"
27
28 U_NAMESPACE_BEGIN
29
30 /*
31 * List of offsets from the current position from where to try matching
32 * a code point or a string.
33 * Store offsets rather than indexes to simplify the code and use the same list
34 * for both increments (in span()) and decrements (in spanBack()).
35 *
36 * Assumption: The maximum offset is limited, and the offsets that are stored
37 * at any one time are relatively dense, that is, there are normally no gaps of
38 * hundreds or thousands of offset values.
39 *
40 * The implementation uses a circular buffer of byte flags,
41 * each indicating whether the corresponding offset is in the list.
42 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
43 * physically moving part of the list.
44 *
45 * Note: In principle, the caller should setMaxLength() to the maximum of the
46 * max string length and U16_LENGTH/U8_LENGTH to account for
47 * "long" single code points.
48 * However, this implementation uses at least a staticList with more than
49 * U8_LENGTH entries anyway.
50 *
51 * Note: If maxLength were guaranteed to be no more than 32 or 64,
52 * the list could be stored as bit flags in a single integer.
53 * Rather than handling a circular buffer with a start list index,
54 * the integer would simply be shifted when lower offsets are removed.
55 * UnicodeSet does not have a limit on the lengths of strings.
56 */
57 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
58 public:
OffsetList()59 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
60
~OffsetList()61 ~OffsetList() {
62 if(list!=staticList) {
63 uprv_free(list);
64 }
65 }
66
67 // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)68 void setMaxLength(int32_t maxLength) {
69 if(maxLength<=(int32_t)sizeof(staticList)) {
70 capacity=(int32_t)sizeof(staticList);
71 } else {
72 UBool *l=(UBool *)uprv_malloc(maxLength);
73 if(l!=NULL) {
74 list=l;
75 capacity=maxLength;
76 }
77 }
78 uprv_memset(list, 0, capacity);
79 }
80
clear()81 void clear() {
82 uprv_memset(list, 0, capacity);
83 start=length=0;
84 }
85
isEmpty() const86 UBool isEmpty() const {
87 return (UBool)(length==0);
88 }
89
90 // Reduce all stored offsets by delta, used when the current position
91 // moves by delta.
92 // There must not be any offsets lower than delta.
93 // If there is an offset equal to delta, it is removed.
94 // delta=[1..maxLength]
shift(int32_t delta)95 void shift(int32_t delta) {
96 int32_t i=start+delta;
97 if(i>=capacity) {
98 i-=capacity;
99 }
100 if(list[i]) {
101 list[i]=FALSE;
102 --length;
103 }
104 start=i;
105 }
106
107 // Add an offset. The list must not contain it yet.
108 // offset=[1..maxLength]
addOffset(int32_t offset)109 void addOffset(int32_t offset) {
110 int32_t i=start+offset;
111 if(i>=capacity) {
112 i-=capacity;
113 }
114 list[i]=TRUE;
115 ++length;
116 }
117
118 // offset=[1..maxLength]
containsOffset(int32_t offset) const119 UBool containsOffset(int32_t offset) const {
120 int32_t i=start+offset;
121 if(i>=capacity) {
122 i-=capacity;
123 }
124 return list[i];
125 }
126
127 // Find the lowest stored offset from a non-empty list, remove it,
128 // and reduce all other offsets by this minimum.
129 // Returns [1..maxLength].
popMinimum()130 int32_t popMinimum() {
131 // Look for the next offset in list[start+1..capacity-1].
132 int32_t i=start, result;
133 while(++i<capacity) {
134 if(list[i]) {
135 list[i]=FALSE;
136 --length;
137 result=i-start;
138 start=i;
139 return result;
140 }
141 }
142 // i==capacity
143
144 // Wrap around and look for the next offset in list[0..start].
145 // Since the list is not empty, there will be one.
146 result=capacity-start;
147 i=0;
148 while(!list[i]) {
149 ++i;
150 }
151 list[i]=FALSE;
152 --length;
153 start=i;
154 return result+=i;
155 }
156
157 private:
158 UBool *list;
159 int32_t capacity;
160 int32_t length;
161 int32_t start;
162
163 UBool staticList[16];
164 };
165
166 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
167 static int32_t
getUTF8Length(const UChar * s,int32_t length)168 getUTF8Length(const UChar *s, int32_t length) {
169 UErrorCode errorCode=U_ZERO_ERROR;
170 int32_t length8=0;
171 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
172 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
173 return length8;
174 } else {
175 // The string contains an unpaired surrogate.
176 // Ignore this string.
177 return 0;
178 }
179 }
180
181 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
182 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)183 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
184 UErrorCode errorCode=U_ZERO_ERROR;
185 int32_t length8=0;
186 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
187 if(U_SUCCESS(errorCode)) {
188 return length8;
189 } else {
190 // The string contains an unpaired surrogate.
191 // Ignore this string.
192 return 0;
193 }
194 }
195
196 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)197 makeSpanLengthByte(int32_t spanLength) {
198 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
199 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
200 }
201
202 // Construct for all variants of span(), or only for any one variant.
203 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)204 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
205 const UVector &setStrings,
206 uint32_t which)
207 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
208 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
209 utf8Length(0),
210 maxLength16(0), maxLength8(0),
211 all((UBool)(which==ALL)) {
212 spanSet.retainAll(set);
213 if(which&NOT_CONTAINED) {
214 // Default to the same sets.
215 // addToSpanNotSet() will create a separate set if necessary.
216 pSpanNotSet=&spanSet;
217 }
218
219 // Determine if the strings even need to be taken into account at all for span() etc.
220 // If any string is relevant, then all strings need to be used for
221 // span(longest match) but only the relevant ones for span(while contained).
222 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
223 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
224 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
225 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
226 int32_t stringsLength=strings.size();
227
228 int32_t i, spanLength;
229 UBool someRelevant=FALSE;
230 for(i=0; i<stringsLength; ++i) {
231 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
232 const UChar *s16=string.getBuffer();
233 int32_t length16=string.length();
234 if (length16==0) {
235 continue; // skip the empty string
236 }
237 UBool thisRelevant;
238 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
239 if(spanLength<length16) { // Relevant string.
240 someRelevant=thisRelevant=TRUE;
241 } else {
242 thisRelevant=FALSE;
243 }
244 if((which&UTF16) && length16>maxLength16) {
245 maxLength16=length16;
246 }
247 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
248 int32_t length8=getUTF8Length(s16, length16);
249 utf8Length+=length8;
250 if(length8>maxLength8) {
251 maxLength8=length8;
252 }
253 }
254 }
255 if(!someRelevant) {
256 maxLength16=maxLength8=0;
257 return;
258 }
259
260 // Freeze after checking for the need to use strings at all because freezing
261 // a set takes some time and memory which are wasted if there are no relevant strings.
262 if(all) {
263 spanSet.freeze();
264 }
265
266 uint8_t *spanBackLengths;
267 uint8_t *spanUTF8Lengths;
268 uint8_t *spanBackUTF8Lengths;
269
270 // Allocate a block of meta data.
271 int32_t allocSize;
272 if(all) {
273 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
274 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
275 } else {
276 allocSize=stringsLength; // One set of span lengths.
277 if(which&UTF8) {
278 // UTF-8 lengths and UTF-8 strings.
279 allocSize+=stringsLength*4+utf8Length;
280 }
281 }
282 if(allocSize<=(int32_t)sizeof(staticLengths)) {
283 utf8Lengths=staticLengths;
284 } else {
285 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
286 if(utf8Lengths==NULL) {
287 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
288 return; // Out of memory.
289 }
290 }
291
292 if(all) {
293 // Store span lengths for all span() variants.
294 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
295 spanBackLengths=spanLengths+stringsLength;
296 spanUTF8Lengths=spanBackLengths+stringsLength;
297 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
298 utf8=spanBackUTF8Lengths+stringsLength;
299 } else {
300 // Store span lengths for only one span() variant.
301 if(which&UTF8) {
302 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
303 utf8=spanLengths+stringsLength;
304 } else {
305 spanLengths=(uint8_t *)utf8Lengths;
306 }
307 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
308 }
309
310 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
311 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
312
313 for(i=0; i<stringsLength; ++i) {
314 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
315 const UChar *s16=string.getBuffer();
316 int32_t length16=string.length();
317 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
318 if(spanLength<length16 && length16>0) { // Relevant string.
319 if(which&UTF16) {
320 if(which&CONTAINED) {
321 if(which&FWD) {
322 spanLengths[i]=makeSpanLengthByte(spanLength);
323 }
324 if(which&BACK) {
325 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
326 spanBackLengths[i]=makeSpanLengthByte(spanLength);
327 }
328 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
329 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
330 }
331 }
332 if(which&UTF8) {
333 uint8_t *s8=utf8+utf8Count;
334 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
335 utf8Count+=utf8Lengths[i]=length8;
336 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
337 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
338 } else { // Relevant for UTF-8.
339 if(which&CONTAINED) {
340 if(which&FWD) {
341 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
342 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
343 }
344 if(which&BACK) {
345 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
346 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
347 }
348 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
349 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
350 }
351 }
352 }
353 if(which&NOT_CONTAINED) {
354 // Add string start and end code points to the spanNotSet so that
355 // a span(while not contained) stops before any string.
356 UChar32 c;
357 if(which&FWD) {
358 int32_t len=0;
359 U16_NEXT(s16, len, length16, c);
360 addToSpanNotSet(c);
361 }
362 if(which&BACK) {
363 int32_t len=length16;
364 U16_PREV(s16, 0, len, c);
365 addToSpanNotSet(c);
366 }
367 }
368 } else { // Irrelevant string. (Also the empty string.)
369 if(which&UTF8) {
370 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
371 uint8_t *s8=utf8+utf8Count;
372 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
373 utf8Count+=utf8Lengths[i]=length8;
374 } else {
375 utf8Lengths[i]=0;
376 }
377 }
378 if(all) {
379 spanLengths[i]=spanBackLengths[i]=
380 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
381 (uint8_t)ALL_CP_CONTAINED;
382 } else {
383 // All spanXYZLengths pointers contain the same address.
384 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
385 }
386 }
387 }
388
389 // Finish.
390 if(all) {
391 pSpanNotSet->freeze();
392 }
393 }
394
395 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)396 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
397 const UVector &newParentSetStrings)
398 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
399 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
400 utf8Length(otherStringSpan.utf8Length),
401 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
402 all(TRUE) {
403 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
404 pSpanNotSet=&spanSet;
405 } else {
406 pSpanNotSet=otherStringSpan.pSpanNotSet->clone();
407 }
408
409 // Allocate a block of meta data.
410 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
411 int32_t stringsLength=strings.size();
412 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
413 if(allocSize<=(int32_t)sizeof(staticLengths)) {
414 utf8Lengths=staticLengths;
415 } else {
416 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
417 if(utf8Lengths==NULL) {
418 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
419 return; // Out of memory.
420 }
421 }
422
423 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
424 utf8=spanLengths+stringsLength*4;
425 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
426 }
427
~UnicodeSetStringSpan()428 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
429 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
430 delete pSpanNotSet;
431 }
432 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
433 uprv_free(utf8Lengths);
434 }
435 }
436
addToSpanNotSet(UChar32 c)437 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
438 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
439 if(spanSet.contains(c)) {
440 return; // Nothing to do.
441 }
442 UnicodeSet *newSet=spanSet.cloneAsThawed();
443 if(newSet==NULL) {
444 return; // Out of memory.
445 } else {
446 pSpanNotSet=newSet;
447 }
448 }
449 pSpanNotSet->add(c);
450 }
451
452 // Compare strings without any argument checks. Requires length>0.
453 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)454 matches16(const UChar *s, const UChar *t, int32_t length) {
455 do {
456 if(*s++!=*t++) {
457 return FALSE;
458 }
459 } while(--length>0);
460 return TRUE;
461 }
462
463 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)464 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
465 do {
466 if(*s++!=*t++) {
467 return FALSE;
468 }
469 } while(--length>0);
470 return TRUE;
471 }
472
473 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
474 // at code point boundaries.
475 // That is, each edge of a match must not be in the middle of a surrogate pair.
476 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)477 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
478 s+=start;
479 limit-=start;
480 return matches16(s, t, length) &&
481 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
482 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
483 }
484
485 // Does the set contain the next code point?
486 // If so, return its length; otherwise return its negative length.
487 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)488 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
489 UChar c=*s, c2;
490 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
491 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
492 }
493 return set.contains(c) ? 1 : -1;
494 }
495
496 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)497 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
498 UChar c=s[length-1], c2;
499 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
500 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
501 }
502 return set.contains(c) ? 1 : -1;
503 }
504
505 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)506 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
507 UChar32 c=*s;
508 if(U8_IS_SINGLE(c)) {
509 return set.contains(c) ? 1 : -1;
510 }
511 // Take advantage of non-ASCII fastpaths in U8_NEXT_OR_FFFD().
512 int32_t i=0;
513 U8_NEXT_OR_FFFD(s, i, length, c);
514 return set.contains(c) ? i : -i;
515 }
516
517 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)518 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
519 UChar32 c=s[length-1];
520 if(U8_IS_SINGLE(c)) {
521 return set.contains(c) ? 1 : -1;
522 }
523 int32_t i=length-1;
524 c=utf8_prevCharSafeBody(s, 0, &i, c, -3);
525 length-=i;
526 return set.contains(c) ? length : -length;
527 }
528
529 /*
530 * Note: In span() when spanLength==0 (after a string match, or at the beginning
531 * after an empty code point span) and in spanNot() and spanNotUTF8(),
532 * string matching could use a binary search
533 * because all string matches are done from the same start index.
534 *
535 * For UTF-8, this would require a comparison function that returns UTF-16 order.
536 *
537 * This optimization should not be necessary for normal UnicodeSets because
538 * most sets have no strings, and most sets with strings have
539 * very few very short strings.
540 * For cases with many strings, it might be better to use a different API
541 * and implementation with a DFA (state machine).
542 */
543
544 /*
545 * Algorithm for span(USET_SPAN_CONTAINED)
546 *
547 * Theoretical algorithm:
548 * - Iterate through the string, and at each code point boundary:
549 * + If the code point there is in the set, then remember to continue after it.
550 * + If a set string matches at the current position, then remember to continue after it.
551 * + Either recursively span for each code point or string match,
552 * or recursively span for all but the shortest one and
553 * iteratively continue the span with the shortest local match.
554 * + Remember the longest recursive span (the farthest end point).
555 * + If there is no match at the current position, neither for the code point there
556 * nor for any set string, then stop and return the longest recursive span length.
557 *
558 * Optimized implementation:
559 *
560 * (We assume that most sets will have very few very short strings.
561 * A span using a string-less set is extremely fast.)
562 *
563 * Create and cache a spanSet which contains all of the single code points
564 * of the original set but none of its strings.
565 *
566 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
567 * - Loop:
568 * + Try to match each set string at the end of the spanLength.
569 * ~ Set strings that start with set-contained code points must be matched
570 * with a partial overlap because the recursive algorithm would have tried
571 * to match them at every position.
572 * ~ Set strings that entirely consist of set-contained code points
573 * are irrelevant for span(USET_SPAN_CONTAINED) because the
574 * recursive algorithm would continue after them anyway
575 * and find the longest recursive match from their end.
576 * ~ Rather than recursing, note each end point of a set string match.
577 * + If no set string matched after spanSet.span(), then return
578 * with where the spanSet.span() ended.
579 * + If at least one set string matched after spanSet.span(), then
580 * pop the shortest string match end point and continue
581 * the loop, trying to match all set strings from there.
582 * + If at least one more set string matched after a previous string match,
583 * then test if the code point after the previous string match is also
584 * contained in the set.
585 * Continue the loop with the shortest end point of either this code point
586 * or a matching set string.
587 * + If no more set string matched after a previous string match,
588 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
589 * Stop if spanLength==0, otherwise continue the loop.
590 *
591 * By noting each end point of a set string match,
592 * the function visits each string position at most once and finishes
593 * in linear time.
594 *
595 * The recursive algorithm may visit the same string position many times
596 * if multiple paths lead to it and finishes in exponential time.
597 */
598
599 /*
600 * Algorithm for span(USET_SPAN_SIMPLE)
601 *
602 * Theoretical algorithm:
603 * - Iterate through the string, and at each code point boundary:
604 * + If the code point there is in the set, then remember to continue after it.
605 * + If a set string matches at the current position, then remember to continue after it.
606 * + Continue from the farthest match position and ignore all others.
607 * + If there is no match at the current position,
608 * then stop and return the current position.
609 *
610 * Optimized implementation:
611 *
612 * (Same assumption and spanSet as above.)
613 *
614 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
615 * - Loop:
616 * + Try to match each set string at the end of the spanLength.
617 * ~ Set strings that start with set-contained code points must be matched
618 * with a partial overlap because the standard algorithm would have tried
619 * to match them earlier.
620 * ~ Set strings that entirely consist of set-contained code points
621 * must be matched with a full overlap because the longest-match algorithm
622 * would hide set string matches that end earlier.
623 * Such set strings need not be matched earlier inside the code point span
624 * because the standard algorithm would then have continued after
625 * the set string match anyway.
626 * ~ Remember the longest set string match (farthest end point) from the earliest
627 * starting point.
628 * + If no set string matched after spanSet.span(), then return
629 * with where the spanSet.span() ended.
630 * + If at least one set string matched, then continue the loop after the
631 * longest match from the earliest position.
632 * + If no more set string matched after a previous string match,
633 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
634 * Stop if spanLength==0, otherwise continue the loop.
635 */
636
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const637 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
638 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
639 return spanNot(s, length);
640 }
641 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
642 if(spanLength==length) {
643 return length;
644 }
645
646 // Consider strings; they may overlap with the span.
647 OffsetList offsets;
648 if(spanCondition==USET_SPAN_CONTAINED) {
649 // Use offset list to try all possibilities.
650 offsets.setMaxLength(maxLength16);
651 }
652 int32_t pos=spanLength, rest=length-pos;
653 int32_t i, stringsLength=strings.size();
654 for(;;) {
655 if(spanCondition==USET_SPAN_CONTAINED) {
656 for(i=0; i<stringsLength; ++i) {
657 int32_t overlap=spanLengths[i];
658 if(overlap==ALL_CP_CONTAINED) {
659 continue; // Irrelevant string. (Also the empty string.)
660 }
661 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
662 const UChar *s16=string.getBuffer();
663 int32_t length16=string.length();
664 U_ASSERT(length>0);
665
666 // Try to match this string at pos-overlap..pos.
667 if(overlap>=LONG_SPAN) {
668 overlap=length16;
669 // While contained: No point matching fully inside the code point span.
670 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
671 }
672 if(overlap>spanLength) {
673 overlap=spanLength;
674 }
675 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
676 for(;;) {
677 if(inc>rest) {
678 break;
679 }
680 // Try to match if the increment is not listed already.
681 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
682 if(inc==rest) {
683 return length; // Reached the end of the string.
684 }
685 offsets.addOffset(inc);
686 }
687 if(overlap==0) {
688 break;
689 }
690 --overlap;
691 ++inc;
692 }
693 }
694 } else /* USET_SPAN_SIMPLE */ {
695 int32_t maxInc=0, maxOverlap=0;
696 for(i=0; i<stringsLength; ++i) {
697 int32_t overlap=spanLengths[i];
698 // For longest match, we do need to try to match even an all-contained string
699 // to find the match from the earliest start.
700
701 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
702 const UChar *s16=string.getBuffer();
703 int32_t length16=string.length();
704 if (length16==0) {
705 continue; // skip the empty string
706 }
707
708 // Try to match this string at pos-overlap..pos.
709 if(overlap>=LONG_SPAN) {
710 overlap=length16;
711 // Longest match: Need to match fully inside the code point span
712 // to find the match from the earliest start.
713 }
714 if(overlap>spanLength) {
715 overlap=spanLength;
716 }
717 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
718 for(;;) {
719 if(inc>rest || overlap<maxOverlap) {
720 break;
721 }
722 // Try to match if the string is longer or starts earlier.
723 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
724 matches16CPB(s, pos-overlap, length, s16, length16)
725 ) {
726 maxInc=inc; // Longest match from earliest start.
727 maxOverlap=overlap;
728 break;
729 }
730 --overlap;
731 ++inc;
732 }
733 }
734
735 if(maxInc!=0 || maxOverlap!=0) {
736 // Longest-match algorithm, and there was a string match.
737 // Simply continue after it.
738 pos+=maxInc;
739 rest-=maxInc;
740 if(rest==0) {
741 return length; // Reached the end of the string.
742 }
743 spanLength=0; // Match strings from after a string match.
744 continue;
745 }
746 }
747 // Finished trying to match all strings at pos.
748
749 if(spanLength!=0 || pos==0) {
750 // The position is after an unlimited code point span (spanLength!=0),
751 // not after a string match.
752 // The only position where spanLength==0 after a span is pos==0.
753 // Otherwise, an unlimited code point span is only tried again when no
754 // strings match, and if such a non-initial span fails we stop.
755 if(offsets.isEmpty()) {
756 return pos; // No strings matched after a span.
757 }
758 // Match strings from after the next string match.
759 } else {
760 // The position is after a string match (or a single code point).
761 if(offsets.isEmpty()) {
762 // No more strings matched after a previous string match.
763 // Try another code point span from after the last string match.
764 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
765 if( spanLength==rest || // Reached the end of the string, or
766 spanLength==0 // neither strings nor span progressed.
767 ) {
768 return pos+spanLength;
769 }
770 pos+=spanLength;
771 rest-=spanLength;
772 continue; // spanLength>0: Match strings from after a span.
773 } else {
774 // Try to match only one code point from after a string match if some
775 // string matched beyond it, so that we try all possible positions
776 // and don't overshoot.
777 spanLength=spanOne(spanSet, s+pos, rest);
778 if(spanLength>0) {
779 if(spanLength==rest) {
780 return length; // Reached the end of the string.
781 }
782 // Match strings after this code point.
783 // There cannot be any increments below it because UnicodeSet strings
784 // contain multiple code points.
785 pos+=spanLength;
786 rest-=spanLength;
787 offsets.shift(spanLength);
788 spanLength=0;
789 continue; // Match strings from after a single code point.
790 }
791 // Match strings from after the next string match.
792 }
793 }
794 int32_t minOffset=offsets.popMinimum();
795 pos+=minOffset;
796 rest-=minOffset;
797 spanLength=0; // Match strings from after a string match.
798 }
799 }
800
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const801 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
802 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
803 return spanNotBack(s, length);
804 }
805 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
806 if(pos==0) {
807 return 0;
808 }
809 int32_t spanLength=length-pos;
810
811 // Consider strings; they may overlap with the span.
812 OffsetList offsets;
813 if(spanCondition==USET_SPAN_CONTAINED) {
814 // Use offset list to try all possibilities.
815 offsets.setMaxLength(maxLength16);
816 }
817 int32_t i, stringsLength=strings.size();
818 uint8_t *spanBackLengths=spanLengths;
819 if(all) {
820 spanBackLengths+=stringsLength;
821 }
822 for(;;) {
823 if(spanCondition==USET_SPAN_CONTAINED) {
824 for(i=0; i<stringsLength; ++i) {
825 int32_t overlap=spanBackLengths[i];
826 if(overlap==ALL_CP_CONTAINED) {
827 continue; // Irrelevant string. (Also the empty string.)
828 }
829 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
830 const UChar *s16=string.getBuffer();
831 int32_t length16=string.length();
832 U_ASSERT(length>0);
833
834 // Try to match this string at pos-(length16-overlap)..pos-length16.
835 if(overlap>=LONG_SPAN) {
836 overlap=length16;
837 // While contained: No point matching fully inside the code point span.
838 int32_t len1=0;
839 U16_FWD_1(s16, len1, overlap);
840 overlap-=len1; // Length of the string minus the first code point.
841 }
842 if(overlap>spanLength) {
843 overlap=spanLength;
844 }
845 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
846 for(;;) {
847 if(dec>pos) {
848 break;
849 }
850 // Try to match if the decrement is not listed already.
851 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
852 if(dec==pos) {
853 return 0; // Reached the start of the string.
854 }
855 offsets.addOffset(dec);
856 }
857 if(overlap==0) {
858 break;
859 }
860 --overlap;
861 ++dec;
862 }
863 }
864 } else /* USET_SPAN_SIMPLE */ {
865 int32_t maxDec=0, maxOverlap=0;
866 for(i=0; i<stringsLength; ++i) {
867 int32_t overlap=spanBackLengths[i];
868 // For longest match, we do need to try to match even an all-contained string
869 // to find the match from the latest end.
870
871 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
872 const UChar *s16=string.getBuffer();
873 int32_t length16=string.length();
874 if (length16==0) {
875 continue; // skip the empty string
876 }
877
878 // Try to match this string at pos-(length16-overlap)..pos-length16.
879 if(overlap>=LONG_SPAN) {
880 overlap=length16;
881 // Longest match: Need to match fully inside the code point span
882 // to find the match from the latest end.
883 }
884 if(overlap>spanLength) {
885 overlap=spanLength;
886 }
887 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
888 for(;;) {
889 if(dec>pos || overlap<maxOverlap) {
890 break;
891 }
892 // Try to match if the string is longer or ends later.
893 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
894 matches16CPB(s, pos-dec, length, s16, length16)
895 ) {
896 maxDec=dec; // Longest match from latest end.
897 maxOverlap=overlap;
898 break;
899 }
900 --overlap;
901 ++dec;
902 }
903 }
904
905 if(maxDec!=0 || maxOverlap!=0) {
906 // Longest-match algorithm, and there was a string match.
907 // Simply continue before it.
908 pos-=maxDec;
909 if(pos==0) {
910 return 0; // Reached the start of the string.
911 }
912 spanLength=0; // Match strings from before a string match.
913 continue;
914 }
915 }
916 // Finished trying to match all strings at pos.
917
918 if(spanLength!=0 || pos==length) {
919 // The position is before an unlimited code point span (spanLength!=0),
920 // not before a string match.
921 // The only position where spanLength==0 before a span is pos==length.
922 // Otherwise, an unlimited code point span is only tried again when no
923 // strings match, and if such a non-initial span fails we stop.
924 if(offsets.isEmpty()) {
925 return pos; // No strings matched before a span.
926 }
927 // Match strings from before the next string match.
928 } else {
929 // The position is before a string match (or a single code point).
930 if(offsets.isEmpty()) {
931 // No more strings matched before a previous string match.
932 // Try another code point span from before the last string match.
933 int32_t oldPos=pos;
934 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
935 spanLength=oldPos-pos;
936 if( pos==0 || // Reached the start of the string, or
937 spanLength==0 // neither strings nor span progressed.
938 ) {
939 return pos;
940 }
941 continue; // spanLength>0: Match strings from before a span.
942 } else {
943 // Try to match only one code point from before a string match if some
944 // string matched beyond it, so that we try all possible positions
945 // and don't overshoot.
946 spanLength=spanOneBack(spanSet, s, pos);
947 if(spanLength>0) {
948 if(spanLength==pos) {
949 return 0; // Reached the start of the string.
950 }
951 // Match strings before this code point.
952 // There cannot be any decrements below it because UnicodeSet strings
953 // contain multiple code points.
954 pos-=spanLength;
955 offsets.shift(spanLength);
956 spanLength=0;
957 continue; // Match strings from before a single code point.
958 }
959 // Match strings from before the next string match.
960 }
961 }
962 pos-=offsets.popMinimum();
963 spanLength=0; // Match strings from before a string match.
964 }
965 }
966
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const967 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
968 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
969 return spanNotUTF8(s, length);
970 }
971 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
972 if(spanLength==length) {
973 return length;
974 }
975
976 // Consider strings; they may overlap with the span.
977 OffsetList offsets;
978 if(spanCondition==USET_SPAN_CONTAINED) {
979 // Use offset list to try all possibilities.
980 offsets.setMaxLength(maxLength8);
981 }
982 int32_t pos=spanLength, rest=length-pos;
983 int32_t i, stringsLength=strings.size();
984 uint8_t *spanUTF8Lengths=spanLengths;
985 if(all) {
986 spanUTF8Lengths+=2*stringsLength;
987 }
988 for(;;) {
989 const uint8_t *s8=utf8;
990 int32_t length8;
991 if(spanCondition==USET_SPAN_CONTAINED) {
992 for(i=0; i<stringsLength; ++i) {
993 length8=utf8Lengths[i];
994 if(length8==0) {
995 continue; // String not representable in UTF-8.
996 }
997 int32_t overlap=spanUTF8Lengths[i];
998 if(overlap==ALL_CP_CONTAINED) {
999 s8+=length8;
1000 continue; // Irrelevant string.
1001 }
1002
1003 // Try to match this string at pos-overlap..pos.
1004 if(overlap>=LONG_SPAN) {
1005 overlap=length8;
1006 // While contained: No point matching fully inside the code point span.
1007 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
1008 }
1009 if(overlap>spanLength) {
1010 overlap=spanLength;
1011 }
1012 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1013 for(;;) {
1014 if(inc>rest) {
1015 break;
1016 }
1017 // Try to match if the increment is not listed already.
1018 // Match at code point boundaries. (The UTF-8 strings were converted
1019 // from UTF-16 and are guaranteed to be well-formed.)
1020 if(!U8_IS_TRAIL(s[pos-overlap]) &&
1021 !offsets.containsOffset(inc) &&
1022 matches8(s+pos-overlap, s8, length8)) {
1023 if(inc==rest) {
1024 return length; // Reached the end of the string.
1025 }
1026 offsets.addOffset(inc);
1027 }
1028 if(overlap==0) {
1029 break;
1030 }
1031 --overlap;
1032 ++inc;
1033 }
1034 s8+=length8;
1035 }
1036 } else /* USET_SPAN_SIMPLE */ {
1037 int32_t maxInc=0, maxOverlap=0;
1038 for(i=0; i<stringsLength; ++i) {
1039 length8=utf8Lengths[i];
1040 if(length8==0) {
1041 continue; // String not representable in UTF-8.
1042 }
1043 int32_t overlap=spanUTF8Lengths[i];
1044 // For longest match, we do need to try to match even an all-contained string
1045 // to find the match from the earliest start.
1046
1047 // Try to match this string at pos-overlap..pos.
1048 if(overlap>=LONG_SPAN) {
1049 overlap=length8;
1050 // Longest match: Need to match fully inside the code point span
1051 // to find the match from the earliest start.
1052 }
1053 if(overlap>spanLength) {
1054 overlap=spanLength;
1055 }
1056 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1057 for(;;) {
1058 if(inc>rest || overlap<maxOverlap) {
1059 break;
1060 }
1061 // Try to match if the string is longer or starts earlier.
1062 // Match at code point boundaries. (The UTF-8 strings were converted
1063 // from UTF-16 and are guaranteed to be well-formed.)
1064 if(!U8_IS_TRAIL(s[pos-overlap]) &&
1065 (overlap>maxOverlap ||
1066 /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1067 matches8(s+pos-overlap, s8, length8)) {
1068 maxInc=inc; // Longest match from earliest start.
1069 maxOverlap=overlap;
1070 break;
1071 }
1072 --overlap;
1073 ++inc;
1074 }
1075 s8+=length8;
1076 }
1077
1078 if(maxInc!=0 || maxOverlap!=0) {
1079 // Longest-match algorithm, and there was a string match.
1080 // Simply continue after it.
1081 pos+=maxInc;
1082 rest-=maxInc;
1083 if(rest==0) {
1084 return length; // Reached the end of the string.
1085 }
1086 spanLength=0; // Match strings from after a string match.
1087 continue;
1088 }
1089 }
1090 // Finished trying to match all strings at pos.
1091
1092 if(spanLength!=0 || pos==0) {
1093 // The position is after an unlimited code point span (spanLength!=0),
1094 // not after a string match.
1095 // The only position where spanLength==0 after a span is pos==0.
1096 // Otherwise, an unlimited code point span is only tried again when no
1097 // strings match, and if such a non-initial span fails we stop.
1098 if(offsets.isEmpty()) {
1099 return pos; // No strings matched after a span.
1100 }
1101 // Match strings from after the next string match.
1102 } else {
1103 // The position is after a string match (or a single code point).
1104 if(offsets.isEmpty()) {
1105 // No more strings matched after a previous string match.
1106 // Try another code point span from after the last string match.
1107 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1108 if( spanLength==rest || // Reached the end of the string, or
1109 spanLength==0 // neither strings nor span progressed.
1110 ) {
1111 return pos+spanLength;
1112 }
1113 pos+=spanLength;
1114 rest-=spanLength;
1115 continue; // spanLength>0: Match strings from after a span.
1116 } else {
1117 // Try to match only one code point from after a string match if some
1118 // string matched beyond it, so that we try all possible positions
1119 // and don't overshoot.
1120 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1121 if(spanLength>0) {
1122 if(spanLength==rest) {
1123 return length; // Reached the end of the string.
1124 }
1125 // Match strings after this code point.
1126 // There cannot be any increments below it because UnicodeSet strings
1127 // contain multiple code points.
1128 pos+=spanLength;
1129 rest-=spanLength;
1130 offsets.shift(spanLength);
1131 spanLength=0;
1132 continue; // Match strings from after a single code point.
1133 }
1134 // Match strings from after the next string match.
1135 }
1136 }
1137 int32_t minOffset=offsets.popMinimum();
1138 pos+=minOffset;
1139 rest-=minOffset;
1140 spanLength=0; // Match strings from after a string match.
1141 }
1142 }
1143
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1144 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1145 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1146 return spanNotBackUTF8(s, length);
1147 }
1148 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1149 if(pos==0) {
1150 return 0;
1151 }
1152 int32_t spanLength=length-pos;
1153
1154 // Consider strings; they may overlap with the span.
1155 OffsetList offsets;
1156 if(spanCondition==USET_SPAN_CONTAINED) {
1157 // Use offset list to try all possibilities.
1158 offsets.setMaxLength(maxLength8);
1159 }
1160 int32_t i, stringsLength=strings.size();
1161 uint8_t *spanBackUTF8Lengths=spanLengths;
1162 if(all) {
1163 spanBackUTF8Lengths+=3*stringsLength;
1164 }
1165 for(;;) {
1166 const uint8_t *s8=utf8;
1167 int32_t length8;
1168 if(spanCondition==USET_SPAN_CONTAINED) {
1169 for(i=0; i<stringsLength; ++i) {
1170 length8=utf8Lengths[i];
1171 if(length8==0) {
1172 continue; // String not representable in UTF-8.
1173 }
1174 int32_t overlap=spanBackUTF8Lengths[i];
1175 if(overlap==ALL_CP_CONTAINED) {
1176 s8+=length8;
1177 continue; // Irrelevant string.
1178 }
1179
1180 // Try to match this string at pos-(length8-overlap)..pos-length8.
1181 if(overlap>=LONG_SPAN) {
1182 overlap=length8;
1183 // While contained: No point matching fully inside the code point span.
1184 int32_t len1=0;
1185 U8_FWD_1(s8, len1, overlap);
1186 overlap-=len1; // Length of the string minus the first code point.
1187 }
1188 if(overlap>spanLength) {
1189 overlap=spanLength;
1190 }
1191 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1192 for(;;) {
1193 if(dec>pos) {
1194 break;
1195 }
1196 // Try to match if the decrement is not listed already.
1197 // Match at code point boundaries. (The UTF-8 strings were converted
1198 // from UTF-16 and are guaranteed to be well-formed.)
1199 if( !U8_IS_TRAIL(s[pos-dec]) &&
1200 !offsets.containsOffset(dec) &&
1201 matches8(s+pos-dec, s8, length8)
1202 ) {
1203 if(dec==pos) {
1204 return 0; // Reached the start of the string.
1205 }
1206 offsets.addOffset(dec);
1207 }
1208 if(overlap==0) {
1209 break;
1210 }
1211 --overlap;
1212 ++dec;
1213 }
1214 s8+=length8;
1215 }
1216 } else /* USET_SPAN_SIMPLE */ {
1217 int32_t maxDec=0, maxOverlap=0;
1218 for(i=0; i<stringsLength; ++i) {
1219 length8=utf8Lengths[i];
1220 if(length8==0) {
1221 continue; // String not representable in UTF-8.
1222 }
1223 int32_t overlap=spanBackUTF8Lengths[i];
1224 // For longest match, we do need to try to match even an all-contained string
1225 // to find the match from the latest end.
1226
1227 // Try to match this string at pos-(length8-overlap)..pos-length8.
1228 if(overlap>=LONG_SPAN) {
1229 overlap=length8;
1230 // Longest match: Need to match fully inside the code point span
1231 // to find the match from the latest end.
1232 }
1233 if(overlap>spanLength) {
1234 overlap=spanLength;
1235 }
1236 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1237 for(;;) {
1238 if(dec>pos || overlap<maxOverlap) {
1239 break;
1240 }
1241 // Try to match if the string is longer or ends later.
1242 // Match at code point boundaries. (The UTF-8 strings were converted
1243 // from UTF-16 and are guaranteed to be well-formed.)
1244 if( !U8_IS_TRAIL(s[pos-dec]) &&
1245 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1246 matches8(s+pos-dec, s8, length8)
1247 ) {
1248 maxDec=dec; // Longest match from latest end.
1249 maxOverlap=overlap;
1250 break;
1251 }
1252 --overlap;
1253 ++dec;
1254 }
1255 s8+=length8;
1256 }
1257
1258 if(maxDec!=0 || maxOverlap!=0) {
1259 // Longest-match algorithm, and there was a string match.
1260 // Simply continue before it.
1261 pos-=maxDec;
1262 if(pos==0) {
1263 return 0; // Reached the start of the string.
1264 }
1265 spanLength=0; // Match strings from before a string match.
1266 continue;
1267 }
1268 }
1269 // Finished trying to match all strings at pos.
1270
1271 if(spanLength!=0 || pos==length) {
1272 // The position is before an unlimited code point span (spanLength!=0),
1273 // not before a string match.
1274 // The only position where spanLength==0 before a span is pos==length.
1275 // Otherwise, an unlimited code point span is only tried again when no
1276 // strings match, and if such a non-initial span fails we stop.
1277 if(offsets.isEmpty()) {
1278 return pos; // No strings matched before a span.
1279 }
1280 // Match strings from before the next string match.
1281 } else {
1282 // The position is before a string match (or a single code point).
1283 if(offsets.isEmpty()) {
1284 // No more strings matched before a previous string match.
1285 // Try another code point span from before the last string match.
1286 int32_t oldPos=pos;
1287 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1288 spanLength=oldPos-pos;
1289 if( pos==0 || // Reached the start of the string, or
1290 spanLength==0 // neither strings nor span progressed.
1291 ) {
1292 return pos;
1293 }
1294 continue; // spanLength>0: Match strings from before a span.
1295 } else {
1296 // Try to match only one code point from before a string match if some
1297 // string matched beyond it, so that we try all possible positions
1298 // and don't overshoot.
1299 spanLength=spanOneBackUTF8(spanSet, s, pos);
1300 if(spanLength>0) {
1301 if(spanLength==pos) {
1302 return 0; // Reached the start of the string.
1303 }
1304 // Match strings before this code point.
1305 // There cannot be any decrements below it because UnicodeSet strings
1306 // contain multiple code points.
1307 pos-=spanLength;
1308 offsets.shift(spanLength);
1309 spanLength=0;
1310 continue; // Match strings from before a single code point.
1311 }
1312 // Match strings from before the next string match.
1313 }
1314 }
1315 pos-=offsets.popMinimum();
1316 spanLength=0; // Match strings from before a string match.
1317 }
1318 }
1319
1320 /*
1321 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1322 *
1323 * Theoretical algorithm:
1324 * - Iterate through the string, and at each code point boundary:
1325 * + If the code point there is in the set, then return with the current position.
1326 * + If a set string matches at the current position, then return with the current position.
1327 *
1328 * Optimized implementation:
1329 *
1330 * (Same assumption as for span() above.)
1331 *
1332 * Create and cache a spanNotSet which contains all of the single code points
1333 * of the original set but none of its strings.
1334 * For each set string add its initial code point to the spanNotSet.
1335 * (Also add its final code point for spanNotBack().)
1336 *
1337 * - Loop:
1338 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1339 * + If the current code point is in the original set, then
1340 * return the current position.
1341 * + If any set string matches at the current position, then
1342 * return the current position.
1343 * + If there is no match at the current position, neither for the code point there
1344 * nor for any set string, then skip this code point and continue the loop.
1345 * This happens for set-string-initial code points that were added to spanNotSet
1346 * when there is not actually a match for such a set string.
1347 */
1348
spanNot(const UChar * s,int32_t length) const1349 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1350 int32_t pos=0, rest=length;
1351 int32_t i, stringsLength=strings.size();
1352 do {
1353 // Span until we find a code point from the set,
1354 // or a code point that starts or ends some string.
1355 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1356 if(i==rest) {
1357 return length; // Reached the end of the string.
1358 }
1359 pos+=i;
1360 rest-=i;
1361
1362 // Check whether the current code point is in the original set,
1363 // without the string starts and ends.
1364 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1365 if(cpLength>0) {
1366 return pos; // There is a set element at pos.
1367 }
1368
1369 // Try to match the strings at pos.
1370 for(i=0; i<stringsLength; ++i) {
1371 if(spanLengths[i]==ALL_CP_CONTAINED) {
1372 continue; // Irrelevant string. (Also the empty string.)
1373 }
1374 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1375 const UChar *s16=string.getBuffer();
1376 int32_t length16=string.length();
1377 U_ASSERT(length>0);
1378 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1379 return pos; // There is a set element at pos.
1380 }
1381 }
1382
1383 // The span(while not contained) ended on a string start/end which is
1384 // not in the original set. Skip this code point and continue.
1385 // cpLength<0
1386 pos-=cpLength;
1387 rest+=cpLength;
1388 } while(rest!=0);
1389 return length; // Reached the end of the string.
1390 }
1391
spanNotBack(const UChar * s,int32_t length) const1392 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1393 int32_t pos=length;
1394 int32_t i, stringsLength=strings.size();
1395 do {
1396 // Span until we find a code point from the set,
1397 // or a code point that starts or ends some string.
1398 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1399 if(pos==0) {
1400 return 0; // Reached the start of the string.
1401 }
1402
1403 // Check whether the current code point is in the original set,
1404 // without the string starts and ends.
1405 int32_t cpLength=spanOneBack(spanSet, s, pos);
1406 if(cpLength>0) {
1407 return pos; // There is a set element at pos.
1408 }
1409
1410 // Try to match the strings at pos.
1411 for(i=0; i<stringsLength; ++i) {
1412 // Use spanLengths rather than a spanBackLengths pointer because
1413 // it is easier and we only need to know whether the string is irrelevant
1414 // which is the same in either array.
1415 if(spanLengths[i]==ALL_CP_CONTAINED) {
1416 continue; // Irrelevant string. (Also the empty string.)
1417 }
1418 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1419 const UChar *s16=string.getBuffer();
1420 int32_t length16=string.length();
1421 U_ASSERT(length>0);
1422 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1423 return pos; // There is a set element at pos.
1424 }
1425 }
1426
1427 // The span(while not contained) ended on a string start/end which is
1428 // not in the original set. Skip this code point and continue.
1429 // cpLength<0
1430 pos+=cpLength;
1431 } while(pos!=0);
1432 return 0; // Reached the start of the string.
1433 }
1434
spanNotUTF8(const uint8_t * s,int32_t length) const1435 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1436 int32_t pos=0, rest=length;
1437 int32_t i, stringsLength=strings.size();
1438 uint8_t *spanUTF8Lengths=spanLengths;
1439 if(all) {
1440 spanUTF8Lengths+=2*stringsLength;
1441 }
1442 do {
1443 // Span until we find a code point from the set,
1444 // or a code point that starts or ends some string.
1445 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1446 if(i==rest) {
1447 return length; // Reached the end of the string.
1448 }
1449 pos+=i;
1450 rest-=i;
1451
1452 // Check whether the current code point is in the original set,
1453 // without the string starts and ends.
1454 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1455 if(cpLength>0) {
1456 return pos; // There is a set element at pos.
1457 }
1458
1459 // Try to match the strings at pos.
1460 const uint8_t *s8=utf8;
1461 int32_t length8;
1462 for(i=0; i<stringsLength; ++i) {
1463 length8=utf8Lengths[i];
1464 // ALL_CP_CONTAINED: Irrelevant string.
1465 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1466 return pos; // There is a set element at pos.
1467 }
1468 s8+=length8;
1469 }
1470
1471 // The span(while not contained) ended on a string start/end which is
1472 // not in the original set. Skip this code point and continue.
1473 // cpLength<0
1474 pos-=cpLength;
1475 rest+=cpLength;
1476 } while(rest!=0);
1477 return length; // Reached the end of the string.
1478 }
1479
spanNotBackUTF8(const uint8_t * s,int32_t length) const1480 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1481 int32_t pos=length;
1482 int32_t i, stringsLength=strings.size();
1483 uint8_t *spanBackUTF8Lengths=spanLengths;
1484 if(all) {
1485 spanBackUTF8Lengths+=3*stringsLength;
1486 }
1487 do {
1488 // Span until we find a code point from the set,
1489 // or a code point that starts or ends some string.
1490 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1491 if(pos==0) {
1492 return 0; // Reached the start of the string.
1493 }
1494
1495 // Check whether the current code point is in the original set,
1496 // without the string starts and ends.
1497 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1498 if(cpLength>0) {
1499 return pos; // There is a set element at pos.
1500 }
1501
1502 // Try to match the strings at pos.
1503 const uint8_t *s8=utf8;
1504 int32_t length8;
1505 for(i=0; i<stringsLength; ++i) {
1506 length8=utf8Lengths[i];
1507 // ALL_CP_CONTAINED: Irrelevant string.
1508 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1509 return pos; // There is a set element at pos.
1510 }
1511 s8+=length8;
1512 }
1513
1514 // The span(while not contained) ended on a string start/end which is
1515 // not in the original set. Skip this code point and continue.
1516 // cpLength<0
1517 pos+=cpLength;
1518 } while(pos!=0);
1519 return 0; // Reached the start of the string.
1520 }
1521
1522 U_NAMESPACE_END
1523