1 /*
2 ******************************************************************************
3 *
4 * Copyright (C) 2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 ******************************************************************************
8 * file name: unisetspan.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2007mar01
14 * created by: Markus W. Scherer
15 */
16
17 #include "unicode/utypes.h"
18 #include "unicode/uniset.h"
19 #include "unicode/ustring.h"
20 #include "cmemory.h"
21 #include "uvector.h"
22 #include "unisetspan.h"
23
24 U_NAMESPACE_BEGIN
25
26 /*
27 * List of offsets from the current position from where to try matching
28 * a code point or a string.
29 * Store offsets rather than indexes to simplify the code and use the same list
30 * for both increments (in span()) and decrements (in spanBack()).
31 *
32 * Assumption: The maximum offset is limited, and the offsets that are stored
33 * at any one time are relatively dense, that is, there are normally no gaps of
34 * hundreds or thousands of offset values.
35 *
36 * The implementation uses a circular buffer of byte flags,
37 * each indicating whether the corresponding offset is in the list.
38 * This avoids inserting into a sorted list of offsets (or absolute indexes) and
39 * physically moving part of the list.
40 *
41 * Note: In principle, the caller should setMaxLength() to the maximum of the
42 * max string length and U16_LENGTH/U8_LENGTH to account for
43 * "long" single code points.
44 * However, this implementation uses at least a staticList with more than
45 * U8_LENGTH entries anyway.
46 *
47 * Note: If maxLength were guaranteed to be no more than 32 or 64,
48 * the list could be stored as bit flags in a single integer.
49 * Rather than handling a circular buffer with a start list index,
50 * the integer would simply be shifted when lower offsets are removed.
51 * UnicodeSet does not have a limit on the lengths of strings.
52 */
53 class OffsetList { // Only ever stack-allocated, does not need to inherit UMemory.
54 public:
OffsetList()55 OffsetList() : list(staticList), capacity(0), length(0), start(0) {}
56
~OffsetList()57 ~OffsetList() {
58 if(list!=staticList) {
59 uprv_free(list);
60 }
61 }
62
63 // Call exactly once if the list is to be used.
setMaxLength(int32_t maxLength)64 void setMaxLength(int32_t maxLength) {
65 if(maxLength<=(int32_t)sizeof(staticList)) {
66 capacity=(int32_t)sizeof(staticList);
67 } else {
68 UBool *l=(UBool *)uprv_malloc(maxLength);
69 if(l!=NULL) {
70 list=l;
71 capacity=maxLength;
72 }
73 }
74 uprv_memset(list, 0, capacity);
75 }
76
clear()77 void clear() {
78 uprv_memset(list, 0, capacity);
79 start=length=0;
80 }
81
isEmpty() const82 UBool isEmpty() const {
83 return (UBool)(length==0);
84 }
85
86 // Reduce all stored offsets by delta, used when the current position
87 // moves by delta.
88 // There must not be any offsets lower than delta.
89 // If there is an offset equal to delta, it is removed.
90 // delta=[1..maxLength]
shift(int32_t delta)91 void shift(int32_t delta) {
92 int32_t i=start+delta;
93 if(i>=capacity) {
94 i-=capacity;
95 }
96 if(list[i]) {
97 list[i]=FALSE;
98 --length;
99 }
100 start=i;
101 }
102
103 // Add an offset. The list must not contain it yet.
104 // offset=[1..maxLength]
addOffset(int32_t offset)105 void addOffset(int32_t offset) {
106 int32_t i=start+offset;
107 if(i>=capacity) {
108 i-=capacity;
109 }
110 list[i]=TRUE;
111 ++length;
112 }
113
114 // offset=[1..maxLength]
containsOffset(int32_t offset) const115 UBool containsOffset(int32_t offset) const {
116 int32_t i=start+offset;
117 if(i>=capacity) {
118 i-=capacity;
119 }
120 return list[i];
121 }
122
123 // Find the lowest stored offset from a non-empty list, remove it,
124 // and reduce all other offsets by this minimum.
125 // Returns [1..maxLength].
popMinimum()126 int32_t popMinimum() {
127 // Look for the next offset in list[start+1..capacity-1].
128 int32_t i=start, result;
129 while(++i<capacity) {
130 if(list[i]) {
131 list[i]=FALSE;
132 --length;
133 result=i-start;
134 start=i;
135 return result;
136 }
137 }
138 // i==capacity
139
140 // Wrap around and look for the next offset in list[0..start].
141 // Since the list is not empty, there will be one.
142 result=capacity-start;
143 i=0;
144 while(!list[i]) {
145 ++i;
146 }
147 list[i]=FALSE;
148 --length;
149 start=i;
150 return result+=i;
151 }
152
153 private:
154 UBool *list;
155 int32_t capacity;
156 int32_t length;
157 int32_t start;
158
159 UBool staticList[16];
160 };
161
162 // Get the number of UTF-8 bytes for a UTF-16 (sub)string.
163 static int32_t
getUTF8Length(const UChar * s,int32_t length)164 getUTF8Length(const UChar *s, int32_t length) {
165 UErrorCode errorCode=U_ZERO_ERROR;
166 int32_t length8=0;
167 u_strToUTF8(NULL, 0, &length8, s, length, &errorCode);
168 if(U_SUCCESS(errorCode) || errorCode==U_BUFFER_OVERFLOW_ERROR) {
169 return length8;
170 } else {
171 // The string contains an unpaired surrogate.
172 // Ignore this string.
173 return 0;
174 }
175 }
176
177 // Append the UTF-8 version of the string to t and return the appended UTF-8 length.
178 static int32_t
appendUTF8(const UChar * s,int32_t length,uint8_t * t,int32_t capacity)179 appendUTF8(const UChar *s, int32_t length, uint8_t *t, int32_t capacity) {
180 UErrorCode errorCode=U_ZERO_ERROR;
181 int32_t length8=0;
182 u_strToUTF8((char *)t, capacity, &length8, s, length, &errorCode);
183 if(U_SUCCESS(errorCode)) {
184 return length8;
185 } else {
186 // The string contains an unpaired surrogate.
187 // Ignore this string.
188 return 0;
189 }
190 }
191
192 static inline uint8_t
makeSpanLengthByte(int32_t spanLength)193 makeSpanLengthByte(int32_t spanLength) {
194 // 0xfe==UnicodeSetStringSpan::LONG_SPAN
195 return spanLength<0xfe ? (uint8_t)spanLength : (uint8_t)0xfe;
196 }
197
198 // Construct for all variants of span(), or only for any one variant.
199 // Initialize as little as possible, for single use.
UnicodeSetStringSpan(const UnicodeSet & set,const UVector & setStrings,uint32_t which)200 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSet &set,
201 const UVector &setStrings,
202 uint32_t which)
203 : spanSet(0, 0x10ffff), pSpanNotSet(NULL), strings(setStrings),
204 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
205 utf8Length(0),
206 maxLength16(0), maxLength8(0),
207 all((UBool)(which==ALL)) {
208 spanSet.retainAll(set);
209 if(which&NOT_CONTAINED) {
210 // Default to the same sets.
211 // addToSpanNotSet() will create a separate set if necessary.
212 pSpanNotSet=&spanSet;
213 }
214
215 // Determine if the strings even need to be taken into account at all for span() etc.
216 // If any string is relevant, then all strings need to be used for
217 // span(longest match) but only the relevant ones for span(while contained).
218 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH
219 // and do not store UTF-8 strings if !thisRelevant and CONTAINED.
220 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.)
221 // Also count the lengths of the UTF-8 versions of the strings for memory allocation.
222 int32_t stringsLength=strings.size();
223
224 int32_t i, spanLength;
225 UBool someRelevant=FALSE;
226 for(i=0; i<stringsLength; ++i) {
227 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
228 const UChar *s16=string.getBuffer();
229 int32_t length16=string.length();
230 UBool thisRelevant;
231 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
232 if(spanLength<length16) { // Relevant string.
233 someRelevant=thisRelevant=TRUE;
234 } else {
235 thisRelevant=FALSE;
236 }
237 if((which&UTF16) && length16>maxLength16) {
238 maxLength16=length16;
239 }
240 if((which&UTF8) && (thisRelevant || (which&CONTAINED))) {
241 int32_t length8=getUTF8Length(s16, length16);
242 utf8Length+=length8;
243 if(length8>maxLength8) {
244 maxLength8=length8;
245 }
246 }
247 }
248 if(!someRelevant) {
249 maxLength16=maxLength8=0;
250 return;
251 }
252
253 // Freeze after checking for the need to use strings at all because freezing
254 // a set takes some time and memory which are wasted if there are no relevant strings.
255 if(all) {
256 spanSet.freeze();
257 }
258
259 uint8_t *spanBackLengths;
260 uint8_t *spanUTF8Lengths;
261 uint8_t *spanBackUTF8Lengths;
262
263 // Allocate a block of meta data.
264 int32_t allocSize;
265 if(all) {
266 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
267 allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
268 } else {
269 allocSize=stringsLength; // One set of span lengths.
270 if(which&UTF8) {
271 // UTF-8 lengths and UTF-8 strings.
272 allocSize+=stringsLength*4+utf8Length;
273 }
274 }
275 if(allocSize<=(int32_t)sizeof(staticLengths)) {
276 utf8Lengths=staticLengths;
277 } else {
278 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
279 if(utf8Lengths==NULL) {
280 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
281 return; // Out of memory.
282 }
283 }
284
285 if(all) {
286 // Store span lengths for all span() variants.
287 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
288 spanBackLengths=spanLengths+stringsLength;
289 spanUTF8Lengths=spanBackLengths+stringsLength;
290 spanBackUTF8Lengths=spanUTF8Lengths+stringsLength;
291 utf8=spanBackUTF8Lengths+stringsLength;
292 } else {
293 // Store span lengths for only one span() variant.
294 if(which&UTF8) {
295 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
296 utf8=spanLengths+stringsLength;
297 } else {
298 spanLengths=(uint8_t *)utf8Lengths;
299 }
300 spanBackLengths=spanUTF8Lengths=spanBackUTF8Lengths=spanLengths;
301 }
302
303 // Set the meta data and pSpanNotSet and write the UTF-8 strings.
304 int32_t utf8Count=0; // Count UTF-8 bytes written so far.
305
306 for(i=0; i<stringsLength; ++i) {
307 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
308 const UChar *s16=string.getBuffer();
309 int32_t length16=string.length();
310 spanLength=spanSet.span(s16, length16, USET_SPAN_CONTAINED);
311 if(spanLength<length16) { // Relevant string.
312 if(which&UTF16) {
313 if(which&CONTAINED) {
314 if(which&FWD) {
315 spanLengths[i]=makeSpanLengthByte(spanLength);
316 }
317 if(which&BACK) {
318 spanLength=length16-spanSet.spanBack(s16, length16, USET_SPAN_CONTAINED);
319 spanBackLengths[i]=makeSpanLengthByte(spanLength);
320 }
321 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
322 spanLengths[i]=spanBackLengths[i]=0; // Only store a relevant/irrelevant flag.
323 }
324 }
325 if(which&UTF8) {
326 uint8_t *s8=utf8+utf8Count;
327 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
328 utf8Count+=utf8Lengths[i]=length8;
329 if(length8==0) { // Irrelevant for UTF-8 because not representable in UTF-8.
330 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=(uint8_t)ALL_CP_CONTAINED;
331 } else { // Relevant for UTF-8.
332 if(which&CONTAINED) {
333 if(which&FWD) {
334 spanLength=spanSet.spanUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
335 spanUTF8Lengths[i]=makeSpanLengthByte(spanLength);
336 }
337 if(which&BACK) {
338 spanLength=length8-spanSet.spanBackUTF8((const char *)s8, length8, USET_SPAN_CONTAINED);
339 spanBackUTF8Lengths[i]=makeSpanLengthByte(spanLength);
340 }
341 } else /* not CONTAINED, not all, but NOT_CONTAINED */ {
342 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=0; // Only store a relevant/irrelevant flag.
343 }
344 }
345 }
346 if(which&NOT_CONTAINED) {
347 // Add string start and end code points to the spanNotSet so that
348 // a span(while not contained) stops before any string.
349 UChar32 c;
350 if(which&FWD) {
351 int32_t len=0;
352 U16_NEXT(s16, len, length16, c);
353 addToSpanNotSet(c);
354 }
355 if(which&BACK) {
356 int32_t len=length16;
357 U16_PREV(s16, 0, len, c);
358 addToSpanNotSet(c);
359 }
360 }
361 } else { // Irrelevant string.
362 if(which&UTF8) {
363 if(which&CONTAINED) { // Only necessary for LONGEST_MATCH.
364 uint8_t *s8=utf8+utf8Count;
365 int32_t length8=appendUTF8(s16, length16, s8, utf8Length-utf8Count);
366 utf8Count+=utf8Lengths[i]=length8;
367 } else {
368 utf8Lengths[i]=0;
369 }
370 }
371 if(all) {
372 spanLengths[i]=spanBackLengths[i]=
373 spanUTF8Lengths[i]=spanBackUTF8Lengths[i]=
374 (uint8_t)ALL_CP_CONTAINED;
375 } else {
376 // All spanXYZLengths pointers contain the same address.
377 spanLengths[i]=(uint8_t)ALL_CP_CONTAINED;
378 }
379 }
380 }
381
382 // Finish.
383 if(all) {
384 pSpanNotSet->freeze();
385 }
386 }
387
388 // Copy constructor. Assumes which==ALL for a frozen set.
UnicodeSetStringSpan(const UnicodeSetStringSpan & otherStringSpan,const UVector & newParentSetStrings)389 UnicodeSetStringSpan::UnicodeSetStringSpan(const UnicodeSetStringSpan &otherStringSpan,
390 const UVector &newParentSetStrings)
391 : spanSet(otherStringSpan.spanSet), pSpanNotSet(NULL), strings(newParentSetStrings),
392 utf8Lengths(NULL), spanLengths(NULL), utf8(NULL),
393 utf8Length(otherStringSpan.utf8Length),
394 maxLength16(otherStringSpan.maxLength16), maxLength8(otherStringSpan.maxLength8),
395 all(TRUE) {
396 if(otherStringSpan.pSpanNotSet==&otherStringSpan.spanSet) {
397 pSpanNotSet=&spanSet;
398 } else {
399 pSpanNotSet=(UnicodeSet *)otherStringSpan.pSpanNotSet->clone();
400 }
401
402 // Allocate a block of meta data.
403 // UTF-8 lengths, 4 sets of span lengths, UTF-8 strings.
404 int32_t stringsLength=strings.size();
405 int32_t allocSize=stringsLength*(4+1+1+1+1)+utf8Length;
406 if(allocSize<=(int32_t)sizeof(staticLengths)) {
407 utf8Lengths=staticLengths;
408 } else {
409 utf8Lengths=(int32_t *)uprv_malloc(allocSize);
410 if(utf8Lengths==NULL) {
411 maxLength16=maxLength8=0; // Prevent usage by making needsStringSpanUTF16/8() return FALSE.
412 return; // Out of memory.
413 }
414 }
415
416 spanLengths=(uint8_t *)(utf8Lengths+stringsLength);
417 utf8=spanLengths+stringsLength*4;
418 uprv_memcpy(utf8Lengths, otherStringSpan.utf8Lengths, allocSize);
419 }
420
~UnicodeSetStringSpan()421 UnicodeSetStringSpan::~UnicodeSetStringSpan() {
422 if(pSpanNotSet!=NULL && pSpanNotSet!=&spanSet) {
423 delete pSpanNotSet;
424 }
425 if(utf8Lengths!=NULL && utf8Lengths!=staticLengths) {
426 uprv_free(utf8Lengths);
427 }
428 }
429
addToSpanNotSet(UChar32 c)430 void UnicodeSetStringSpan::addToSpanNotSet(UChar32 c) {
431 if(pSpanNotSet==NULL || pSpanNotSet==&spanSet) {
432 if(spanSet.contains(c)) {
433 return; // Nothing to do.
434 }
435 UnicodeSet *newSet=(UnicodeSet *)spanSet.cloneAsThawed();
436 if(newSet==NULL) {
437 return; // Out of memory.
438 } else {
439 pSpanNotSet=newSet;
440 }
441 }
442 pSpanNotSet->add(c);
443 }
444
445 // Compare strings without any argument checks. Requires length>0.
446 static inline UBool
matches16(const UChar * s,const UChar * t,int32_t length)447 matches16(const UChar *s, const UChar *t, int32_t length) {
448 do {
449 if(*s++!=*t++) {
450 return FALSE;
451 }
452 } while(--length>0);
453 return TRUE;
454 }
455
456 static inline UBool
matches8(const uint8_t * s,const uint8_t * t,int32_t length)457 matches8(const uint8_t *s, const uint8_t *t, int32_t length) {
458 do {
459 if(*s++!=*t++) {
460 return FALSE;
461 }
462 } while(--length>0);
463 return TRUE;
464 }
465
466 // Compare 16-bit Unicode strings (which may be malformed UTF-16)
467 // at code point boundaries.
468 // That is, each edge of a match must not be in the middle of a surrogate pair.
469 static inline UBool
matches16CPB(const UChar * s,int32_t start,int32_t limit,const UChar * t,int32_t length)470 matches16CPB(const UChar *s, int32_t start, int32_t limit, const UChar *t, int32_t length) {
471 s+=start;
472 limit-=start;
473 return matches16(s, t, length) &&
474 !(0<start && U16_IS_LEAD(s[-1]) && U16_IS_TRAIL(s[0])) &&
475 !(length<limit && U16_IS_LEAD(s[length-1]) && U16_IS_TRAIL(s[length]));
476 }
477
478 // Does the set contain the next code point?
479 // If so, return its length; otherwise return its negative length.
480 static inline int32_t
spanOne(const UnicodeSet & set,const UChar * s,int32_t length)481 spanOne(const UnicodeSet &set, const UChar *s, int32_t length) {
482 UChar c=*s, c2;
483 if(c>=0xd800 && c<=0xdbff && length>=2 && U16_IS_TRAIL(c2=s[1])) {
484 return set.contains(U16_GET_SUPPLEMENTARY(c, c2)) ? 2 : -2;
485 }
486 return set.contains(c) ? 1 : -1;
487 }
488
489 static inline int32_t
spanOneBack(const UnicodeSet & set,const UChar * s,int32_t length)490 spanOneBack(const UnicodeSet &set, const UChar *s, int32_t length) {
491 UChar c=s[length-1], c2;
492 if(c>=0xdc00 && c<=0xdfff && length>=2 && U16_IS_LEAD(c2=s[length-2])) {
493 return set.contains(U16_GET_SUPPLEMENTARY(c2, c)) ? 2 : -2;
494 }
495 return set.contains(c) ? 1 : -1;
496 }
497
498 static inline int32_t
spanOneUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)499 spanOneUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
500 UChar32 c=*s;
501 if((int8_t)c>=0) {
502 return set.contains(c) ? 1 : -1;
503 }
504 // Take advantage of non-ASCII fastpaths in U8_NEXT().
505 int32_t i=0;
506 U8_NEXT(s, i, length, c);
507 return set.contains(c) ? i : -i;
508 }
509
510 static inline int32_t
spanOneBackUTF8(const UnicodeSet & set,const uint8_t * s,int32_t length)511 spanOneBackUTF8(const UnicodeSet &set, const uint8_t *s, int32_t length) {
512 UChar32 c=s[length-1];
513 if((int8_t)c>=0) {
514 return set.contains(c) ? 1 : -1;
515 }
516 int32_t i=length-1;
517 c=utf8_prevCharSafeBody(s, 0, &i, c, -1);
518 length-=i;
519 return set.contains(c) ? length : -length;
520 }
521
522 /*
523 * Note: In span() when spanLength==0 (after a string match, or at the beginning
524 * after an empty code point span) and in spanNot() and spanNotUTF8(),
525 * string matching could use a binary search
526 * because all string matches are done from the same start index.
527 *
528 * For UTF-8, this would require a comparison function that returns UTF-16 order.
529 *
530 * This optimization should not be necessary for normal UnicodeSets because
531 * most sets have no strings, and most sets with strings have
532 * very few very short strings.
533 * For cases with many strings, it might be better to use a different API
534 * and implementation with a DFA (state machine).
535 */
536
537 /*
538 * Algorithm for span(USET_SPAN_CONTAINED)
539 *
540 * Theoretical algorithm:
541 * - Iterate through the string, and at each code point boundary:
542 * + If the code point there is in the set, then remember to continue after it.
543 * + If a set string matches at the current position, then remember to continue after it.
544 * + Either recursively span for each code point or string match,
545 * or recursively span for all but the shortest one and
546 * iteratively continue the span with the shortest local match.
547 * + Remember the longest recursive span (the farthest end point).
548 * + If there is no match at the current position, neither for the code point there
549 * nor for any set string, then stop and return the longest recursive span length.
550 *
551 * Optimized implementation:
552 *
553 * (We assume that most sets will have very few very short strings.
554 * A span using a string-less set is extremely fast.)
555 *
556 * Create and cache a spanSet which contains all of the single code points
557 * of the original set but none of its strings.
558 *
559 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
560 * - Loop:
561 * + Try to match each set string at the end of the spanLength.
562 * ~ Set strings that start with set-contained code points must be matched
563 * with a partial overlap because the recursive algorithm would have tried
564 * to match them at every position.
565 * ~ Set strings that entirely consist of set-contained code points
566 * are irrelevant for span(USET_SPAN_CONTAINED) because the
567 * recursive algorithm would continue after them anyway
568 * and find the longest recursive match from their end.
569 * ~ Rather than recursing, note each end point of a set string match.
570 * + If no set string matched after spanSet.span(), then return
571 * with where the spanSet.span() ended.
572 * + If at least one set string matched after spanSet.span(), then
573 * pop the shortest string match end point and continue
574 * the loop, trying to match all set strings from there.
575 * + If at least one more set string matched after a previous string match,
576 * then test if the code point after the previous string match is also
577 * contained in the set.
578 * Continue the loop with the shortest end point of either this code point
579 * or a matching set string.
580 * + If no more set string matched after a previous string match,
581 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
582 * Stop if spanLength==0, otherwise continue the loop.
583 *
584 * By noting each end point of a set string match,
585 * the function visits each string position at most once and finishes
586 * in linear time.
587 *
588 * The recursive algorithm may visit the same string position many times
589 * if multiple paths lead to it and finishes in exponential time.
590 */
591
592 /*
593 * Algorithm for span(USET_SPAN_SIMPLE)
594 *
595 * Theoretical algorithm:
596 * - Iterate through the string, and at each code point boundary:
597 * + If the code point there is in the set, then remember to continue after it.
598 * + If a set string matches at the current position, then remember to continue after it.
599 * + Continue from the farthest match position and ignore all others.
600 * + If there is no match at the current position,
601 * then stop and return the current position.
602 *
603 * Optimized implementation:
604 *
605 * (Same assumption and spanSet as above.)
606 *
607 * - Start with spanLength=spanSet.span(USET_SPAN_CONTAINED).
608 * - Loop:
609 * + Try to match each set string at the end of the spanLength.
610 * ~ Set strings that start with set-contained code points must be matched
611 * with a partial overlap because the standard algorithm would have tried
612 * to match them earlier.
613 * ~ Set strings that entirely consist of set-contained code points
614 * must be matched with a full overlap because the longest-match algorithm
615 * would hide set string matches that end earlier.
616 * Such set strings need not be matched earlier inside the code point span
617 * because the standard algorithm would then have continued after
618 * the set string match anyway.
619 * ~ Remember the longest set string match (farthest end point) from the earliest
620 * starting point.
621 * + If no set string matched after spanSet.span(), then return
622 * with where the spanSet.span() ended.
623 * + If at least one set string matched, then continue the loop after the
624 * longest match from the earliest position.
625 * + If no more set string matched after a previous string match,
626 * then try another spanLength=spanSet.span(USET_SPAN_CONTAINED).
627 * Stop if spanLength==0, otherwise continue the loop.
628 */
629
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const630 int32_t UnicodeSetStringSpan::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
631 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
632 return spanNot(s, length);
633 }
634 int32_t spanLength=spanSet.span(s, length, USET_SPAN_CONTAINED);
635 if(spanLength==length) {
636 return length;
637 }
638
639 // Consider strings; they may overlap with the span.
640 OffsetList offsets;
641 if(spanCondition==USET_SPAN_CONTAINED) {
642 // Use offset list to try all possibilities.
643 offsets.setMaxLength(maxLength16);
644 }
645 int32_t pos=spanLength, rest=length-pos;
646 int32_t i, stringsLength=strings.size();
647 for(;;) {
648 if(spanCondition==USET_SPAN_CONTAINED) {
649 for(i=0; i<stringsLength; ++i) {
650 int32_t overlap=spanLengths[i];
651 if(overlap==ALL_CP_CONTAINED) {
652 continue; // Irrelevant string.
653 }
654 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
655 const UChar *s16=string.getBuffer();
656 int32_t length16=string.length();
657
658 // Try to match this string at pos-overlap..pos.
659 if(overlap>=LONG_SPAN) {
660 overlap=length16;
661 // While contained: No point matching fully inside the code point span.
662 U16_BACK_1(s16, 0, overlap); // Length of the string minus the last code point.
663 }
664 if(overlap>spanLength) {
665 overlap=spanLength;
666 }
667 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
668 for(;;) {
669 if(inc>rest) {
670 break;
671 }
672 // Try to match if the increment is not listed already.
673 if(!offsets.containsOffset(inc) && matches16CPB(s, pos-overlap, length, s16, length16)) {
674 if(inc==rest) {
675 return length; // Reached the end of the string.
676 }
677 offsets.addOffset(inc);
678 }
679 if(overlap==0) {
680 break;
681 }
682 --overlap;
683 ++inc;
684 }
685 }
686 } else /* USET_SPAN_SIMPLE */ {
687 int32_t maxInc=0, maxOverlap=0;
688 for(i=0; i<stringsLength; ++i) {
689 int32_t overlap=spanLengths[i];
690 // For longest match, we do need to try to match even an all-contained string
691 // to find the match from the earliest start.
692
693 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
694 const UChar *s16=string.getBuffer();
695 int32_t length16=string.length();
696
697 // Try to match this string at pos-overlap..pos.
698 if(overlap>=LONG_SPAN) {
699 overlap=length16;
700 // Longest match: Need to match fully inside the code point span
701 // to find the match from the earliest start.
702 }
703 if(overlap>spanLength) {
704 overlap=spanLength;
705 }
706 int32_t inc=length16-overlap; // Keep overlap+inc==length16.
707 for(;;) {
708 if(inc>rest || overlap<maxOverlap) {
709 break;
710 }
711 // Try to match if the string is longer or starts earlier.
712 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
713 matches16CPB(s, pos-overlap, length, s16, length16)
714 ) {
715 maxInc=inc; // Longest match from earliest start.
716 maxOverlap=overlap;
717 break;
718 }
719 --overlap;
720 ++inc;
721 }
722 }
723
724 if(maxInc!=0 || maxOverlap!=0) {
725 // Longest-match algorithm, and there was a string match.
726 // Simply continue after it.
727 pos+=maxInc;
728 rest-=maxInc;
729 if(rest==0) {
730 return length; // Reached the end of the string.
731 }
732 spanLength=0; // Match strings from after a string match.
733 continue;
734 }
735 }
736 // Finished trying to match all strings at pos.
737
738 if(spanLength!=0 || pos==0) {
739 // The position is after an unlimited code point span (spanLength!=0),
740 // not after a string match.
741 // The only position where spanLength==0 after a span is pos==0.
742 // Otherwise, an unlimited code point span is only tried again when no
743 // strings match, and if such a non-initial span fails we stop.
744 if(offsets.isEmpty()) {
745 return pos; // No strings matched after a span.
746 }
747 // Match strings from after the next string match.
748 } else {
749 // The position is after a string match (or a single code point).
750 if(offsets.isEmpty()) {
751 // No more strings matched after a previous string match.
752 // Try another code point span from after the last string match.
753 spanLength=spanSet.span(s+pos, rest, USET_SPAN_CONTAINED);
754 if( spanLength==rest || // Reached the end of the string, or
755 spanLength==0 // neither strings nor span progressed.
756 ) {
757 return pos+spanLength;
758 }
759 pos+=spanLength;
760 rest-=spanLength;
761 continue; // spanLength>0: Match strings from after a span.
762 } else {
763 // Try to match only one code point from after a string match if some
764 // string matched beyond it, so that we try all possible positions
765 // and don't overshoot.
766 spanLength=spanOne(spanSet, s+pos, rest);
767 if(spanLength>0) {
768 if(spanLength==rest) {
769 return length; // Reached the end of the string.
770 }
771 // Match strings after this code point.
772 // There cannot be any increments below it because UnicodeSet strings
773 // contain multiple code points.
774 pos+=spanLength;
775 rest-=spanLength;
776 offsets.shift(spanLength);
777 spanLength=0;
778 continue; // Match strings from after a single code point.
779 }
780 // Match strings from after the next string match.
781 }
782 }
783 int32_t minOffset=offsets.popMinimum();
784 pos+=minOffset;
785 rest-=minOffset;
786 spanLength=0; // Match strings from after a string match.
787 }
788 }
789
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const790 int32_t UnicodeSetStringSpan::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
791 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
792 return spanNotBack(s, length);
793 }
794 int32_t pos=spanSet.spanBack(s, length, USET_SPAN_CONTAINED);
795 if(pos==0) {
796 return 0;
797 }
798 int32_t spanLength=length-pos;
799
800 // Consider strings; they may overlap with the span.
801 OffsetList offsets;
802 if(spanCondition==USET_SPAN_CONTAINED) {
803 // Use offset list to try all possibilities.
804 offsets.setMaxLength(maxLength16);
805 }
806 int32_t i, stringsLength=strings.size();
807 uint8_t *spanBackLengths=spanLengths;
808 if(all) {
809 spanBackLengths+=stringsLength;
810 }
811 for(;;) {
812 if(spanCondition==USET_SPAN_CONTAINED) {
813 for(i=0; i<stringsLength; ++i) {
814 int32_t overlap=spanBackLengths[i];
815 if(overlap==ALL_CP_CONTAINED) {
816 continue; // Irrelevant string.
817 }
818 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
819 const UChar *s16=string.getBuffer();
820 int32_t length16=string.length();
821
822 // Try to match this string at pos-(length16-overlap)..pos-length16.
823 if(overlap>=LONG_SPAN) {
824 overlap=length16;
825 // While contained: No point matching fully inside the code point span.
826 int32_t len1=0;
827 U16_FWD_1(s16, len1, overlap);
828 overlap-=len1; // Length of the string minus the first code point.
829 }
830 if(overlap>spanLength) {
831 overlap=spanLength;
832 }
833 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
834 for(;;) {
835 if(dec>pos) {
836 break;
837 }
838 // Try to match if the decrement is not listed already.
839 if(!offsets.containsOffset(dec) && matches16CPB(s, pos-dec, length, s16, length16)) {
840 if(dec==pos) {
841 return 0; // Reached the start of the string.
842 }
843 offsets.addOffset(dec);
844 }
845 if(overlap==0) {
846 break;
847 }
848 --overlap;
849 ++dec;
850 }
851 }
852 } else /* USET_SPAN_SIMPLE */ {
853 int32_t maxDec=0, maxOverlap=0;
854 for(i=0; i<stringsLength; ++i) {
855 int32_t overlap=spanBackLengths[i];
856 // For longest match, we do need to try to match even an all-contained string
857 // to find the match from the latest end.
858
859 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
860 const UChar *s16=string.getBuffer();
861 int32_t length16=string.length();
862
863 // Try to match this string at pos-(length16-overlap)..pos-length16.
864 if(overlap>=LONG_SPAN) {
865 overlap=length16;
866 // Longest match: Need to match fully inside the code point span
867 // to find the match from the latest end.
868 }
869 if(overlap>spanLength) {
870 overlap=spanLength;
871 }
872 int32_t dec=length16-overlap; // Keep dec+overlap==length16.
873 for(;;) {
874 if(dec>pos || overlap<maxOverlap) {
875 break;
876 }
877 // Try to match if the string is longer or ends later.
878 if( (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
879 matches16CPB(s, pos-dec, length, s16, length16)
880 ) {
881 maxDec=dec; // Longest match from latest end.
882 maxOverlap=overlap;
883 break;
884 }
885 --overlap;
886 ++dec;
887 }
888 }
889
890 if(maxDec!=0 || maxOverlap!=0) {
891 // Longest-match algorithm, and there was a string match.
892 // Simply continue before it.
893 pos-=maxDec;
894 if(pos==0) {
895 return 0; // Reached the start of the string.
896 }
897 spanLength=0; // Match strings from before a string match.
898 continue;
899 }
900 }
901 // Finished trying to match all strings at pos.
902
903 if(spanLength!=0 || pos==length) {
904 // The position is before an unlimited code point span (spanLength!=0),
905 // not before a string match.
906 // The only position where spanLength==0 before a span is pos==length.
907 // Otherwise, an unlimited code point span is only tried again when no
908 // strings match, and if such a non-initial span fails we stop.
909 if(offsets.isEmpty()) {
910 return pos; // No strings matched before a span.
911 }
912 // Match strings from before the next string match.
913 } else {
914 // The position is before a string match (or a single code point).
915 if(offsets.isEmpty()) {
916 // No more strings matched before a previous string match.
917 // Try another code point span from before the last string match.
918 int32_t oldPos=pos;
919 pos=spanSet.spanBack(s, oldPos, USET_SPAN_CONTAINED);
920 spanLength=oldPos-pos;
921 if( pos==0 || // Reached the start of the string, or
922 spanLength==0 // neither strings nor span progressed.
923 ) {
924 return pos;
925 }
926 continue; // spanLength>0: Match strings from before a span.
927 } else {
928 // Try to match only one code point from before a string match if some
929 // string matched beyond it, so that we try all possible positions
930 // and don't overshoot.
931 spanLength=spanOneBack(spanSet, s, pos);
932 if(spanLength>0) {
933 if(spanLength==pos) {
934 return 0; // Reached the start of the string.
935 }
936 // Match strings before this code point.
937 // There cannot be any decrements below it because UnicodeSet strings
938 // contain multiple code points.
939 pos-=spanLength;
940 offsets.shift(spanLength);
941 spanLength=0;
942 continue; // Match strings from before a single code point.
943 }
944 // Match strings from before the next string match.
945 }
946 }
947 pos-=offsets.popMinimum();
948 spanLength=0; // Match strings from before a string match.
949 }
950 }
951
spanUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const952 int32_t UnicodeSetStringSpan::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
953 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
954 return spanNotUTF8(s, length);
955 }
956 int32_t spanLength=spanSet.spanUTF8((const char *)s, length, USET_SPAN_CONTAINED);
957 if(spanLength==length) {
958 return length;
959 }
960
961 // Consider strings; they may overlap with the span.
962 OffsetList offsets;
963 if(spanCondition==USET_SPAN_CONTAINED) {
964 // Use offset list to try all possibilities.
965 offsets.setMaxLength(maxLength8);
966 }
967 int32_t pos=spanLength, rest=length-pos;
968 int32_t i, stringsLength=strings.size();
969 uint8_t *spanUTF8Lengths=spanLengths;
970 if(all) {
971 spanUTF8Lengths+=2*stringsLength;
972 }
973 for(;;) {
974 const uint8_t *s8=utf8;
975 int32_t length8;
976 if(spanCondition==USET_SPAN_CONTAINED) {
977 for(i=0; i<stringsLength; ++i) {
978 length8=utf8Lengths[i];
979 if(length8==0) {
980 continue; // String not representable in UTF-8.
981 }
982 int32_t overlap=spanUTF8Lengths[i];
983 if(overlap==ALL_CP_CONTAINED) {
984 s8+=length8;
985 continue; // Irrelevant string.
986 }
987
988 // Try to match this string at pos-overlap..pos.
989 if(overlap>=LONG_SPAN) {
990 overlap=length8;
991 // While contained: No point matching fully inside the code point span.
992 U8_BACK_1(s8, 0, overlap); // Length of the string minus the last code point.
993 }
994 if(overlap>spanLength) {
995 overlap=spanLength;
996 }
997 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
998 for(;;) {
999 if(inc>rest) {
1000 break;
1001 }
1002 // Try to match if the increment is not listed already.
1003 // Match at code point boundaries. (The UTF-8 strings were converted
1004 // from UTF-16 and are guaranteed to be well-formed.)
1005 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1006 !offsets.containsOffset(inc) &&
1007 matches8(s+pos-overlap, s8, length8)
1008
1009 ) {
1010 if(inc==rest) {
1011 return length; // Reached the end of the string.
1012 }
1013 offsets.addOffset(inc);
1014 }
1015 if(overlap==0) {
1016 break;
1017 }
1018 --overlap;
1019 ++inc;
1020 }
1021 s8+=length8;
1022 }
1023 } else /* USET_SPAN_SIMPLE */ {
1024 int32_t maxInc=0, maxOverlap=0;
1025 for(i=0; i<stringsLength; ++i) {
1026 length8=utf8Lengths[i];
1027 if(length8==0) {
1028 continue; // String not representable in UTF-8.
1029 }
1030 int32_t overlap=spanUTF8Lengths[i];
1031 // For longest match, we do need to try to match even an all-contained string
1032 // to find the match from the earliest start.
1033
1034 // Try to match this string at pos-overlap..pos.
1035 if(overlap>=LONG_SPAN) {
1036 overlap=length8;
1037 // Longest match: Need to match fully inside the code point span
1038 // to find the match from the earliest start.
1039 }
1040 if(overlap>spanLength) {
1041 overlap=spanLength;
1042 }
1043 int32_t inc=length8-overlap; // Keep overlap+inc==length8.
1044 for(;;) {
1045 if(inc>rest || overlap<maxOverlap) {
1046 break;
1047 }
1048 // Try to match if the string is longer or starts earlier.
1049 // Match at code point boundaries. (The UTF-8 strings were converted
1050 // from UTF-16 and are guaranteed to be well-formed.)
1051 if( !U8_IS_TRAIL(s[pos-overlap]) &&
1052 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ inc>maxInc) &&
1053 matches8(s+pos-overlap, s8, length8)
1054
1055 ) {
1056 maxInc=inc; // Longest match from earliest start.
1057 maxOverlap=overlap;
1058 break;
1059 }
1060 --overlap;
1061 ++inc;
1062 }
1063 s8+=length8;
1064 }
1065
1066 if(maxInc!=0 || maxOverlap!=0) {
1067 // Longest-match algorithm, and there was a string match.
1068 // Simply continue after it.
1069 pos+=maxInc;
1070 rest-=maxInc;
1071 if(rest==0) {
1072 return length; // Reached the end of the string.
1073 }
1074 spanLength=0; // Match strings from after a string match.
1075 continue;
1076 }
1077 }
1078 // Finished trying to match all strings at pos.
1079
1080 if(spanLength!=0 || pos==0) {
1081 // The position is after an unlimited code point span (spanLength!=0),
1082 // not after a string match.
1083 // The only position where spanLength==0 after a span is pos==0.
1084 // Otherwise, an unlimited code point span is only tried again when no
1085 // strings match, and if such a non-initial span fails we stop.
1086 if(offsets.isEmpty()) {
1087 return pos; // No strings matched after a span.
1088 }
1089 // Match strings from after the next string match.
1090 } else {
1091 // The position is after a string match (or a single code point).
1092 if(offsets.isEmpty()) {
1093 // No more strings matched after a previous string match.
1094 // Try another code point span from after the last string match.
1095 spanLength=spanSet.spanUTF8((const char *)s+pos, rest, USET_SPAN_CONTAINED);
1096 if( spanLength==rest || // Reached the end of the string, or
1097 spanLength==0 // neither strings nor span progressed.
1098 ) {
1099 return pos+spanLength;
1100 }
1101 pos+=spanLength;
1102 rest-=spanLength;
1103 continue; // spanLength>0: Match strings from after a span.
1104 } else {
1105 // Try to match only one code point from after a string match if some
1106 // string matched beyond it, so that we try all possible positions
1107 // and don't overshoot.
1108 spanLength=spanOneUTF8(spanSet, s+pos, rest);
1109 if(spanLength>0) {
1110 if(spanLength==rest) {
1111 return length; // Reached the end of the string.
1112 }
1113 // Match strings after this code point.
1114 // There cannot be any increments below it because UnicodeSet strings
1115 // contain multiple code points.
1116 pos+=spanLength;
1117 rest-=spanLength;
1118 offsets.shift(spanLength);
1119 spanLength=0;
1120 continue; // Match strings from after a single code point.
1121 }
1122 // Match strings from after the next string match.
1123 }
1124 }
1125 int32_t minOffset=offsets.popMinimum();
1126 pos+=minOffset;
1127 rest-=minOffset;
1128 spanLength=0; // Match strings from after a string match.
1129 }
1130 }
1131
spanBackUTF8(const uint8_t * s,int32_t length,USetSpanCondition spanCondition) const1132 int32_t UnicodeSetStringSpan::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const {
1133 if(spanCondition==USET_SPAN_NOT_CONTAINED) {
1134 return spanNotBackUTF8(s, length);
1135 }
1136 int32_t pos=spanSet.spanBackUTF8((const char *)s, length, USET_SPAN_CONTAINED);
1137 if(pos==0) {
1138 return 0;
1139 }
1140 int32_t spanLength=length-pos;
1141
1142 // Consider strings; they may overlap with the span.
1143 OffsetList offsets;
1144 if(spanCondition==USET_SPAN_CONTAINED) {
1145 // Use offset list to try all possibilities.
1146 offsets.setMaxLength(maxLength8);
1147 }
1148 int32_t i, stringsLength=strings.size();
1149 uint8_t *spanBackUTF8Lengths=spanLengths;
1150 if(all) {
1151 spanBackUTF8Lengths+=3*stringsLength;
1152 }
1153 for(;;) {
1154 const uint8_t *s8=utf8;
1155 int32_t length8;
1156 if(spanCondition==USET_SPAN_CONTAINED) {
1157 for(i=0; i<stringsLength; ++i) {
1158 length8=utf8Lengths[i];
1159 if(length8==0) {
1160 continue; // String not representable in UTF-8.
1161 }
1162 int32_t overlap=spanBackUTF8Lengths[i];
1163 if(overlap==ALL_CP_CONTAINED) {
1164 s8+=length8;
1165 continue; // Irrelevant string.
1166 }
1167
1168 // Try to match this string at pos-(length8-overlap)..pos-length8.
1169 if(overlap>=LONG_SPAN) {
1170 overlap=length8;
1171 // While contained: No point matching fully inside the code point span.
1172 int32_t len1=0;
1173 U8_FWD_1(s8, len1, overlap);
1174 overlap-=len1; // Length of the string minus the first code point.
1175 }
1176 if(overlap>spanLength) {
1177 overlap=spanLength;
1178 }
1179 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1180 for(;;) {
1181 if(dec>pos) {
1182 break;
1183 }
1184 // Try to match if the decrement is not listed already.
1185 // Match at code point boundaries. (The UTF-8 strings were converted
1186 // from UTF-16 and are guaranteed to be well-formed.)
1187 if( !U8_IS_TRAIL(s[pos-dec]) &&
1188 !offsets.containsOffset(dec) &&
1189 matches8(s+pos-dec, s8, length8)
1190 ) {
1191 if(dec==pos) {
1192 return 0; // Reached the start of the string.
1193 }
1194 offsets.addOffset(dec);
1195 }
1196 if(overlap==0) {
1197 break;
1198 }
1199 --overlap;
1200 ++dec;
1201 }
1202 s8+=length8;
1203 }
1204 } else /* USET_SPAN_SIMPLE */ {
1205 int32_t maxDec=0, maxOverlap=0;
1206 for(i=0; i<stringsLength; ++i) {
1207 length8=utf8Lengths[i];
1208 if(length8==0) {
1209 continue; // String not representable in UTF-8.
1210 }
1211 int32_t overlap=spanBackUTF8Lengths[i];
1212 // For longest match, we do need to try to match even an all-contained string
1213 // to find the match from the latest end.
1214
1215 // Try to match this string at pos-(length8-overlap)..pos-length8.
1216 if(overlap>=LONG_SPAN) {
1217 overlap=length8;
1218 // Longest match: Need to match fully inside the code point span
1219 // to find the match from the latest end.
1220 }
1221 if(overlap>spanLength) {
1222 overlap=spanLength;
1223 }
1224 int32_t dec=length8-overlap; // Keep dec+overlap==length8.
1225 for(;;) {
1226 if(dec>pos || overlap<maxOverlap) {
1227 break;
1228 }
1229 // Try to match if the string is longer or ends later.
1230 // Match at code point boundaries. (The UTF-8 strings were converted
1231 // from UTF-16 and are guaranteed to be well-formed.)
1232 if( !U8_IS_TRAIL(s[pos-dec]) &&
1233 (overlap>maxOverlap || /* redundant overlap==maxOverlap && */ dec>maxDec) &&
1234 matches8(s+pos-dec, s8, length8)
1235 ) {
1236 maxDec=dec; // Longest match from latest end.
1237 maxOverlap=overlap;
1238 break;
1239 }
1240 --overlap;
1241 ++dec;
1242 }
1243 s8+=length8;
1244 }
1245
1246 if(maxDec!=0 || maxOverlap!=0) {
1247 // Longest-match algorithm, and there was a string match.
1248 // Simply continue before it.
1249 pos-=maxDec;
1250 if(pos==0) {
1251 return 0; // Reached the start of the string.
1252 }
1253 spanLength=0; // Match strings from before a string match.
1254 continue;
1255 }
1256 }
1257 // Finished trying to match all strings at pos.
1258
1259 if(spanLength!=0 || pos==length) {
1260 // The position is before an unlimited code point span (spanLength!=0),
1261 // not before a string match.
1262 // The only position where spanLength==0 before a span is pos==length.
1263 // Otherwise, an unlimited code point span is only tried again when no
1264 // strings match, and if such a non-initial span fails we stop.
1265 if(offsets.isEmpty()) {
1266 return pos; // No strings matched before a span.
1267 }
1268 // Match strings from before the next string match.
1269 } else {
1270 // The position is before a string match (or a single code point).
1271 if(offsets.isEmpty()) {
1272 // No more strings matched before a previous string match.
1273 // Try another code point span from before the last string match.
1274 int32_t oldPos=pos;
1275 pos=spanSet.spanBackUTF8((const char *)s, oldPos, USET_SPAN_CONTAINED);
1276 spanLength=oldPos-pos;
1277 if( pos==0 || // Reached the start of the string, or
1278 spanLength==0 // neither strings nor span progressed.
1279 ) {
1280 return pos;
1281 }
1282 continue; // spanLength>0: Match strings from before a span.
1283 } else {
1284 // Try to match only one code point from before a string match if some
1285 // string matched beyond it, so that we try all possible positions
1286 // and don't overshoot.
1287 spanLength=spanOneBackUTF8(spanSet, s, pos);
1288 if(spanLength>0) {
1289 if(spanLength==pos) {
1290 return 0; // Reached the start of the string.
1291 }
1292 // Match strings before this code point.
1293 // There cannot be any decrements below it because UnicodeSet strings
1294 // contain multiple code points.
1295 pos-=spanLength;
1296 offsets.shift(spanLength);
1297 spanLength=0;
1298 continue; // Match strings from before a single code point.
1299 }
1300 // Match strings from before the next string match.
1301 }
1302 }
1303 pos-=offsets.popMinimum();
1304 spanLength=0; // Match strings from before a string match.
1305 }
1306 }
1307
1308 /*
1309 * Algorithm for spanNot()==span(USET_SPAN_NOT_CONTAINED)
1310 *
1311 * Theoretical algorithm:
1312 * - Iterate through the string, and at each code point boundary:
1313 * + If the code point there is in the set, then return with the current position.
1314 * + If a set string matches at the current position, then return with the current position.
1315 *
1316 * Optimized implementation:
1317 *
1318 * (Same assumption as for span() above.)
1319 *
1320 * Create and cache a spanNotSet which contains all of the single code points
1321 * of the original set but none of its strings.
1322 * For each set string add its initial code point to the spanNotSet.
1323 * (Also add its final code point for spanNotBack().)
1324 *
1325 * - Loop:
1326 * + Do spanLength=spanNotSet.span(USET_SPAN_NOT_CONTAINED).
1327 * + If the current code point is in the original set, then
1328 * return the current position.
1329 * + If any set string matches at the current position, then
1330 * return the current position.
1331 * + If there is no match at the current position, neither for the code point there
1332 * nor for any set string, then skip this code point and continue the loop.
1333 * This happens for set-string-initial code points that were added to spanNotSet
1334 * when there is not actually a match for such a set string.
1335 */
1336
spanNot(const UChar * s,int32_t length) const1337 int32_t UnicodeSetStringSpan::spanNot(const UChar *s, int32_t length) const {
1338 int32_t pos=0, rest=length;
1339 int32_t i, stringsLength=strings.size();
1340 do {
1341 // Span until we find a code point from the set,
1342 // or a code point that starts or ends some string.
1343 i=pSpanNotSet->span(s+pos, rest, USET_SPAN_NOT_CONTAINED);
1344 if(i==rest) {
1345 return length; // Reached the end of the string.
1346 }
1347 pos+=i;
1348 rest-=i;
1349
1350 // Check whether the current code point is in the original set,
1351 // without the string starts and ends.
1352 int32_t cpLength=spanOne(spanSet, s+pos, rest);
1353 if(cpLength>0) {
1354 return pos; // There is a set element at pos.
1355 }
1356
1357 // Try to match the strings at pos.
1358 for(i=0; i<stringsLength; ++i) {
1359 if(spanLengths[i]==ALL_CP_CONTAINED) {
1360 continue; // Irrelevant string.
1361 }
1362 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1363 const UChar *s16=string.getBuffer();
1364 int32_t length16=string.length();
1365 if(length16<=rest && matches16CPB(s, pos, length, s16, length16)) {
1366 return pos; // There is a set element at pos.
1367 }
1368 }
1369
1370 // The span(while not contained) ended on a string start/end which is
1371 // not in the original set. Skip this code point and continue.
1372 // cpLength<0
1373 pos-=cpLength;
1374 rest+=cpLength;
1375 } while(rest!=0);
1376 return length; // Reached the end of the string.
1377 }
1378
spanNotBack(const UChar * s,int32_t length) const1379 int32_t UnicodeSetStringSpan::spanNotBack(const UChar *s, int32_t length) const {
1380 int32_t pos=length;
1381 int32_t i, stringsLength=strings.size();
1382 do {
1383 // Span until we find a code point from the set,
1384 // or a code point that starts or ends some string.
1385 pos=pSpanNotSet->spanBack(s, pos, USET_SPAN_NOT_CONTAINED);
1386 if(pos==0) {
1387 return 0; // Reached the start of the string.
1388 }
1389
1390 // Check whether the current code point is in the original set,
1391 // without the string starts and ends.
1392 int32_t cpLength=spanOneBack(spanSet, s, pos);
1393 if(cpLength>0) {
1394 return pos; // There is a set element at pos.
1395 }
1396
1397 // Try to match the strings at pos.
1398 for(i=0; i<stringsLength; ++i) {
1399 // Use spanLengths rather than a spanBackLengths pointer because
1400 // it is easier and we only need to know whether the string is irrelevant
1401 // which is the same in either array.
1402 if(spanLengths[i]==ALL_CP_CONTAINED) {
1403 continue; // Irrelevant string.
1404 }
1405 const UnicodeString &string=*(const UnicodeString *)strings.elementAt(i);
1406 const UChar *s16=string.getBuffer();
1407 int32_t length16=string.length();
1408 if(length16<=pos && matches16CPB(s, pos-length16, length, s16, length16)) {
1409 return pos; // There is a set element at pos.
1410 }
1411 }
1412
1413 // The span(while not contained) ended on a string start/end which is
1414 // not in the original set. Skip this code point and continue.
1415 // cpLength<0
1416 pos+=cpLength;
1417 } while(pos!=0);
1418 return 0; // Reached the start of the string.
1419 }
1420
spanNotUTF8(const uint8_t * s,int32_t length) const1421 int32_t UnicodeSetStringSpan::spanNotUTF8(const uint8_t *s, int32_t length) const {
1422 int32_t pos=0, rest=length;
1423 int32_t i, stringsLength=strings.size();
1424 uint8_t *spanUTF8Lengths=spanLengths;
1425 if(all) {
1426 spanUTF8Lengths+=2*stringsLength;
1427 }
1428 do {
1429 // Span until we find a code point from the set,
1430 // or a code point that starts or ends some string.
1431 i=pSpanNotSet->spanUTF8((const char *)s+pos, rest, USET_SPAN_NOT_CONTAINED);
1432 if(i==rest) {
1433 return length; // Reached the end of the string.
1434 }
1435 pos+=i;
1436 rest-=i;
1437
1438 // Check whether the current code point is in the original set,
1439 // without the string starts and ends.
1440 int32_t cpLength=spanOneUTF8(spanSet, s+pos, rest);
1441 if(cpLength>0) {
1442 return pos; // There is a set element at pos.
1443 }
1444
1445 // Try to match the strings at pos.
1446 const uint8_t *s8=utf8;
1447 int32_t length8;
1448 for(i=0; i<stringsLength; ++i) {
1449 length8=utf8Lengths[i];
1450 // ALL_CP_CONTAINED: Irrelevant string.
1451 if(length8!=0 && spanUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=rest && matches8(s+pos, s8, length8)) {
1452 return pos; // There is a set element at pos.
1453 }
1454 s8+=length8;
1455 }
1456
1457 // The span(while not contained) ended on a string start/end which is
1458 // not in the original set. Skip this code point and continue.
1459 // cpLength<0
1460 pos-=cpLength;
1461 rest+=cpLength;
1462 } while(rest!=0);
1463 return length; // Reached the end of the string.
1464 }
1465
spanNotBackUTF8(const uint8_t * s,int32_t length) const1466 int32_t UnicodeSetStringSpan::spanNotBackUTF8(const uint8_t *s, int32_t length) const {
1467 int32_t pos=length;
1468 int32_t i, stringsLength=strings.size();
1469 uint8_t *spanBackUTF8Lengths=spanLengths;
1470 if(all) {
1471 spanBackUTF8Lengths+=3*stringsLength;
1472 }
1473 do {
1474 // Span until we find a code point from the set,
1475 // or a code point that starts or ends some string.
1476 pos=pSpanNotSet->spanBackUTF8((const char *)s, pos, USET_SPAN_NOT_CONTAINED);
1477 if(pos==0) {
1478 return 0; // Reached the start of the string.
1479 }
1480
1481 // Check whether the current code point is in the original set,
1482 // without the string starts and ends.
1483 int32_t cpLength=spanOneBackUTF8(spanSet, s, pos);
1484 if(cpLength>0) {
1485 return pos; // There is a set element at pos.
1486 }
1487
1488 // Try to match the strings at pos.
1489 const uint8_t *s8=utf8;
1490 int32_t length8;
1491 for(i=0; i<stringsLength; ++i) {
1492 length8=utf8Lengths[i];
1493 // ALL_CP_CONTAINED: Irrelevant string.
1494 if(length8!=0 && spanBackUTF8Lengths[i]!=ALL_CP_CONTAINED && length8<=pos && matches8(s+pos-length8, s8, length8)) {
1495 return pos; // There is a set element at pos.
1496 }
1497 s8+=length8;
1498 }
1499
1500 // The span(while not contained) ended on a string start/end which is
1501 // not in the original set. Skip this code point and continue.
1502 // cpLength<0
1503 pos+=cpLength;
1504 } while(pos!=0);
1505 return 0; // Reached the start of the string.
1506 }
1507
1508 U_NAMESPACE_END
1509