• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 1999-2015, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 *   Date        Name        Description
9 *   10/20/99    alan        Creation.
10 **********************************************************************
11 */
12 
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32 
33 // HIGH_VALUE > all valid values. 110000 for codepoints
34 #define UNICODESET_HIGH 0x0110000
35 
36 // LOW <= all valid values. ZERO for codepoints
37 #define UNICODESET_LOW 0x000000
38 
39 /** Max list [0, 1, 2, ..., max code point, HIGH] */
40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
41 
42 U_NAMESPACE_BEGIN
43 
~SymbolTable()44 SymbolTable::~SymbolTable() {}
45 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47 
48 /**
49  * Modify the given UChar32 variable so that it is in range, by
50  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52  * It modifies its argument in-place and also returns it.
53  */
54 static inline UChar32 pinCodePoint(UChar32& c) {
55     if (c < UNICODESET_LOW) {
56         c = UNICODESET_LOW;
57     } else if (c > (UNICODESET_HIGH-1)) {
58         c = (UNICODESET_HIGH-1);
59     }
60     return c;
61 }
62 
63 //----------------------------------------------------------------
64 // Debugging
65 //----------------------------------------------------------------
66 
67 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
68 // To enable the debugging, define the symbol DEBUG_MEM in the line
69 // below.  This will result in text being sent to stdout that looks
70 // like this:
71 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73 // Each line lists a construction (ct) or destruction (dt) event, the
74 // object address, the number of outstanding objects after the event,
75 // and the pattern of the object in question.
76 
77 // #define DEBUG_MEM
78 
79 #ifdef DEBUG_MEM
80 #include <stdio.h>
81 static int32_t _dbgCount = 0;
82 
_dbgct(UnicodeSet * set)83 static inline void _dbgct(UnicodeSet* set) {
84     UnicodeString str;
85     set->toPattern(str, true);
86     char buf[40];
87     str.extract(0, 39, buf, "");
88     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89 }
90 
_dbgdt(UnicodeSet * set)91 static inline void _dbgdt(UnicodeSet* set) {
92     UnicodeString str;
93     set->toPattern(str, true);
94     char buf[40];
95     str.extract(0, 39, buf, "");
96     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97 }
98 
99 #else
100 
101 #define _dbgct(set)
102 #define _dbgdt(set)
103 
104 #endif
105 
106 //----------------------------------------------------------------
107 // UnicodeString in UVector support
108 //----------------------------------------------------------------
109 
cloneUnicodeString(UElement * dst,UElement * src)110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
112 }
113 
compareUnicodeString(UElement t1,UElement t2)114 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
116     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
117     return a.compare(b);
118 }
119 
hasStrings() const120 UBool UnicodeSet::hasStrings() const {
121     return strings != nullptr && !strings->isEmpty();
122 }
123 
stringsSize() const124 int32_t UnicodeSet::stringsSize() const {
125     return strings == nullptr ? 0 : strings->size();
126 }
127 
stringsContains(const UnicodeString & s) const128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129     return strings != nullptr && strings->contains((void*) &s);
130 }
131 
132 //----------------------------------------------------------------
133 // Constructors &c
134 //----------------------------------------------------------------
135 
136 /**
137  * Constructs an empty set.
138  */
UnicodeSet()139 UnicodeSet::UnicodeSet() {
140     list[0] = UNICODESET_HIGH;
141     _dbgct(this);
142 }
143 
144 /**
145  * Constructs a set containing the given range. If <code>end >
146  * start</code> then an empty set is created.
147  *
148  * @param start first character, inclusive, of range
149  * @param end last character, inclusive, of range
150  */
UnicodeSet(UChar32 start,UChar32 end)151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152     list[0] = UNICODESET_HIGH;
153     add(start, end);
154     _dbgct(this);
155 }
156 
157 /**
158  * Constructs a set that is identical to the given UnicodeSet.
159  */
UnicodeSet(const UnicodeSet & o)160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161     *this = o;
162     _dbgct(this);
163 }
164 
165 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167     if (ensureCapacity(o.len)) {
168         // *this = o except for bmpSet and stringSpan
169         len = o.len;
170         uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171         if (o.hasStrings()) {
172             UErrorCode status = U_ZERO_ERROR;
173             if (!allocateStrings(status) ||
174                     (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
175                 setToBogus();
176                 return;
177             }
178         }
179         if (o.pat) {
180             setPattern(o.pat, o.patLen);
181         }
182         _dbgct(this);
183     }
184 }
185 
186 /**
187  * Destructs the set.
188  */
~UnicodeSet()189 UnicodeSet::~UnicodeSet() {
190     _dbgdt(this); // first!
191     if (list != stackList) {
192         uprv_free(list);
193     }
194     delete bmpSet;
195     if (buffer != stackList) {
196         uprv_free(buffer);
197     }
198     delete strings;
199     delete stringSpan;
200     releasePattern();
201 }
202 
203 /**
204  * Assigns this object to be a copy of another.
205  */
operator =(const UnicodeSet & o)206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207     return copyFrom(o, false);
208 }
209 
copyFrom(const UnicodeSet & o,UBool asThawed)210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211     if (this == &o) {
212         return *this;
213     }
214     if (isFrozen()) {
215         return *this;
216     }
217     if (o.isBogus()) {
218         setToBogus();
219         return *this;
220     }
221     if (!ensureCapacity(o.len)) {
222         // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223         return *this;
224     }
225     len = o.len;
226     uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227     if (o.bmpSet != nullptr && !asThawed) {
228         bmpSet = new BMPSet(*o.bmpSet, list, len);
229         if (bmpSet == nullptr) { // Check for memory allocation error.
230             setToBogus();
231             return *this;
232         }
233     }
234     if (o.hasStrings()) {
235         UErrorCode status = U_ZERO_ERROR;
236         if ((strings == nullptr && !allocateStrings(status)) ||
237                 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
238             setToBogus();
239             return *this;
240         }
241     } else if (hasStrings()) {
242         strings->removeAllElements();
243     }
244     if (o.stringSpan != nullptr && !asThawed) {
245         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
246         if (stringSpan == nullptr) { // Check for memory allocation error.
247             setToBogus();
248             return *this;
249         }
250     }
251     releasePattern();
252     if (o.pat) {
253         setPattern(o.pat, o.patLen);
254     }
255     return *this;
256 }
257 
258 /**
259  * Returns a copy of this object.  All UnicodeMatcher objects have
260  * to support cloning in order to allow classes using
261  * UnicodeMatchers, such as Transliterator, to implement cloning.
262  */
clone() const263 UnicodeSet* UnicodeSet::clone() const {
264     return new UnicodeSet(*this);
265 }
266 
cloneAsThawed() const267 UnicodeSet *UnicodeSet::cloneAsThawed() const {
268     return new UnicodeSet(*this, true);
269 }
270 
271 /**
272  * Compares the specified object with this set for equality.  Returns
273  * <tt>true</tt> if the two sets
274  * have the same size, and every member of the specified set is
275  * contained in this set (or equivalently, every member of this set is
276  * contained in the specified set).
277  *
278  * @param o set to be compared for equality with this set.
279  * @return <tt>true</tt> if the specified set is equal to this set.
280  */
operator ==(const UnicodeSet & o) const281 bool UnicodeSet::operator==(const UnicodeSet& o) const {
282     if (len != o.len) return false;
283     for (int32_t i = 0; i < len; ++i) {
284         if (list[i] != o.list[i]) return false;
285     }
286     if (hasStrings() != o.hasStrings()) { return false; }
287     if (hasStrings() && *strings != *o.strings) return false;
288     return true;
289 }
290 
291 /**
292  * Returns the hash code value for this set.
293  *
294  * @return the hash code value for this set.
295  * @see Object#hashCode()
296  */
hashCode() const297 int32_t UnicodeSet::hashCode() const {
298     uint32_t result = static_cast<uint32_t>(len);
299     for (int32_t i = 0; i < len; ++i) {
300         result *= 1000003u;
301         result += list[i];
302     }
303     return static_cast<int32_t>(result);
304 }
305 
306 //----------------------------------------------------------------
307 // Public API
308 //----------------------------------------------------------------
309 
310 /**
311  * Returns the number of elements in this set (its cardinality),
312  * Note than the elements of a set may include both individual
313  * codepoints and strings.
314  *
315  * @return the number of elements in this set (its cardinality).
316  */
size() const317 int32_t UnicodeSet::size() const {
318     int32_t n = 0;
319     int32_t count = getRangeCount();
320     for (int32_t i = 0; i < count; ++i) {
321         n += getRangeEnd(i) - getRangeStart(i) + 1;
322     }
323     return n + stringsSize();
324 }
325 
326 /**
327  * Returns <tt>true</tt> if this set contains no elements.
328  *
329  * @return <tt>true</tt> if this set contains no elements.
330  */
isEmpty() const331 UBool UnicodeSet::isEmpty() const {
332     return len == 1 && !hasStrings();
333 }
334 
335 /**
336  * Returns true if this set contains the given character.
337  * @param c character to be checked for containment
338  * @return true if the test condition is met
339  */
contains(UChar32 c) const340 UBool UnicodeSet::contains(UChar32 c) const {
341     // Set i to the index of the start item greater than ch
342     // We know we will terminate without length test!
343     // LATER: for large sets, add binary search
344     //int32_t i = -1;
345     //for (;;) {
346     //    if (c < list[++i]) break;
347     //}
348     if (bmpSet != nullptr) {
349         return bmpSet->contains(c);
350     }
351     if (stringSpan != nullptr) {
352         return stringSpan->contains(c);
353     }
354     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355         return false;
356     }
357     int32_t i = findCodePoint(c);
358     return (UBool)(i & 1); // return true if odd
359 }
360 
361 /**
362  * Returns the smallest value i such that c < list[i].  Caller
363  * must ensure that c is a legal value or this method will enter
364  * an infinite loop.  This method performs a binary search.
365  * @param c a character in the range MIN_VALUE..MAX_VALUE
366  * inclusive
367  * @return the smallest integer i in the range 0..len-1,
368  * inclusive, such that c < list[i]
369  */
findCodePoint(UChar32 c) const370 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371     /* Examples:
372                                        findCodePoint(c)
373        set              list[]         c=0 1 3 4 7 8
374        ===              ==============   ===========
375        []               [110000]         0 0 0 0 0 0
376        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
377        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
378        [:Any:]          [0, 110000]      1 1 1 1 1 1
379      */
380 
381     // Return the smallest i such that c < list[i].  Assume
382     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383     if (c < list[0])
384         return 0;
385     // High runner test.  c is often after the last range, so an
386     // initial check for this condition pays off.
387     int32_t lo = 0;
388     int32_t hi = len - 1;
389     if (lo >= hi || c >= list[hi-1])
390         return hi;
391     // invariant: c >= list[lo]
392     // invariant: c < list[hi]
393     for (;;) {
394         int32_t i = (lo + hi) >> 1;
395         if (i == lo) {
396             break; // Found!
397         } else if (c < list[i]) {
398             hi = i;
399         } else {
400             lo = i;
401         }
402     }
403     return hi;
404 }
405 
406 /**
407  * Returns true if this set contains every character
408  * of the given range.
409  * @param start first character, inclusive, of the range
410  * @param end last character, inclusive, of the range
411  * @return true if the test condition is met
412  */
contains(UChar32 start,UChar32 end) const413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414     //int32_t i = -1;
415     //for (;;) {
416     //    if (start < list[++i]) break;
417     //}
418     int32_t i = findCodePoint(start);
419     return ((i & 1) != 0 && end < list[i]);
420 }
421 
422 /**
423  * Returns <tt>true</tt> if this set contains the given
424  * multicharacter string.
425  * @param s string to be checked for containment
426  * @return <tt>true</tt> if this set contains the specified string
427  */
contains(const UnicodeString & s) const428 UBool UnicodeSet::contains(const UnicodeString& s) const {
429     int32_t cp = getSingleCP(s);
430     if (cp < 0) {
431         return stringsContains(s);
432     } else {
433         return contains((UChar32) cp);
434     }
435 }
436 
437 /**
438  * Returns true if this set contains all the characters and strings
439  * of the given set.
440  * @param c set to be checked for containment
441  * @return true if the test condition is met
442  */
containsAll(const UnicodeSet & c) const443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444     // The specified set is a subset if all of its pairs are contained in
445     // this set.  It's possible to code this more efficiently in terms of
446     // direct manipulation of the inversion lists if the need arises.
447     int32_t n = c.getRangeCount();
448     for (int i=0; i<n; ++i) {
449         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450             return false;
451         }
452     }
453     return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
454 }
455 
456 /**
457  * Returns true if this set contains all the characters
458  * of the given string.
459  * @param s string containing characters to be checked for containment
460  * @return true if the test condition is met
461  */
containsAll(const UnicodeString & s) const462 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
464                    s.length());
465 }
466 
467 /**
468  * Returns true if this set contains none of the characters
469  * of the given range.
470  * @param start first character, inclusive, of the range
471  * @param end last character, inclusive, of the range
472  * @return true if the test condition is met
473  */
containsNone(UChar32 start,UChar32 end) const474 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
475     //int32_t i = -1;
476     //for (;;) {
477     //    if (start < list[++i]) break;
478     //}
479     int32_t i = findCodePoint(start);
480     return ((i & 1) == 0 && end < list[i]);
481 }
482 
483 /**
484  * Returns true if this set contains none of the characters and strings
485  * of the given set.
486  * @param c set to be checked for containment
487  * @return true if the test condition is met
488  */
containsNone(const UnicodeSet & c) const489 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
490     // The specified set is a subset if all of its pairs are contained in
491     // this set.  It's possible to code this more efficiently in terms of
492     // direct manipulation of the inversion lists if the need arises.
493     int32_t n = c.getRangeCount();
494     for (int32_t i=0; i<n; ++i) {
495         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
496             return false;
497         }
498     }
499     return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
500 }
501 
502 /**
503  * Returns true if this set contains none of the characters
504  * of the given string.
505  * @param s string containing characters to be checked for containment
506  * @return true if the test condition is met
507  */
containsNone(const UnicodeString & s) const508 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
509     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
510                    s.length());
511 }
512 
513 /**
514  * Returns <tt>true</tt> if this set contains any character whose low byte
515  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
516  * indexing.
517  */
matchesIndexValue(uint8_t v) const518 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
519     /* The index value v, in the range [0,255], is contained in this set if
520      * it is contained in any pair of this set.  Pairs either have the high
521      * bytes equal, or unequal.  If the high bytes are equal, then we have
522      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
523      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
524      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
525      * time zone month containment logic.)
526      */
527     int32_t i;
528     int32_t rangeCount=getRangeCount();
529     for (i=0; i<rangeCount; ++i) {
530         UChar32 low = getRangeStart(i);
531         UChar32 high = getRangeEnd(i);
532         if ((low & ~0xFF) == (high & ~0xFF)) {
533             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
534                 return true;
535             }
536         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
537             return true;
538         }
539     }
540     if (hasStrings()) {
541         for (i=0; i<strings->size(); ++i) {
542             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
543             if (s.isEmpty()) {
544                 continue;  // skip the empty string
545             }
546             UChar32 c = s.char32At(0);
547             if ((c & 0xFF) == v) {
548                 return true;
549             }
550         }
551     }
552     return false;
553 }
554 
555 /**
556  * Implementation of UnicodeMatcher::matches().  Always matches the
557  * longest possible multichar string.
558  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)559 UMatchDegree UnicodeSet::matches(const Replaceable& text,
560                                  int32_t& offset,
561                                  int32_t limit,
562                                  UBool incremental) {
563     if (offset == limit) {
564         if (contains(U_ETHER)) {
565             return incremental ? U_PARTIAL_MATCH : U_MATCH;
566         } else {
567             return U_MISMATCH;
568         }
569     } else {
570         if (hasStrings()) { // try strings first
571 
572             // might separate forward and backward loops later
573             // for now they are combined
574 
575             // TODO Improve efficiency of this, at least in the forward
576             // direction, if not in both.  In the forward direction we
577             // can assume the strings are sorted.
578 
579             int32_t i;
580             UBool forward = offset < limit;
581 
582             // firstChar is the leftmost char to match in the
583             // forward direction or the rightmost char to match in
584             // the reverse direction.
585             char16_t firstChar = text.charAt(offset);
586 
587             // If there are multiple strings that can match we
588             // return the longest match.
589             int32_t highWaterLength = 0;
590 
591             for (i=0; i<strings->size(); ++i) {
592                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
593                 if (trial.isEmpty()) {
594                     continue;  // skip the empty string
595                 }
596 
597                 char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
598 
599                 // Strings are sorted, so we can optimize in the
600                 // forward direction.
601                 if (forward && c > firstChar) break;
602                 if (c != firstChar) continue;
603 
604                 int32_t matchLen = matchRest(text, offset, limit, trial);
605 
606                 if (incremental) {
607                     int32_t maxLen = forward ? limit-offset : offset-limit;
608                     if (matchLen == maxLen) {
609                         // We have successfully matched but only up to limit.
610                         return U_PARTIAL_MATCH;
611                     }
612                 }
613 
614                 if (matchLen == trial.length()) {
615                     // We have successfully matched the whole string.
616                     if (matchLen > highWaterLength) {
617                         highWaterLength = matchLen;
618                     }
619                     // In the forward direction we know strings
620                     // are sorted so we can bail early.
621                     if (forward && matchLen < highWaterLength) {
622                         break;
623                     }
624                     continue;
625                 }
626             }
627 
628             // We've checked all strings without a partial match.
629             // If we have full matches, return the longest one.
630             if (highWaterLength != 0) {
631                 offset += forward ? highWaterLength : -highWaterLength;
632                 return U_MATCH;
633             }
634         }
635         return UnicodeFilter::matches(text, offset, limit, incremental);
636     }
637 }
638 
639 /**
640  * Returns the longest match for s in text at the given position.
641  * If limit > start then match forward from start+1 to limit
642  * matching all characters except s.charAt(0).  If limit < start,
643  * go backward starting from start-1 matching all characters
644  * except s.charAt(s.length()-1).  This method assumes that the
645  * first character, text.charAt(start), matches s, so it does not
646  * check it.
647  * @param text the text to match
648  * @param start the first character to match.  In the forward
649  * direction, text.charAt(start) is matched against s.charAt(0).
650  * In the reverse direction, it is matched against
651  * s.charAt(s.length()-1).
652  * @param limit the limit offset for matching, either last+1 in
653  * the forward direction, or last-1 in the reverse direction,
654  * where last is the index of the last character to match.
655  * @return If part of s matches up to the limit, return |limit -
656  * start|.  If all of s matches before reaching the limit, return
657  * s.length().  If there is a mismatch between s and text, return
658  * 0
659  */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)660 int32_t UnicodeSet::matchRest(const Replaceable& text,
661                               int32_t start, int32_t limit,
662                               const UnicodeString& s) {
663     int32_t i;
664     int32_t maxLen;
665     int32_t slen = s.length();
666     if (start < limit) {
667         maxLen = limit - start;
668         if (maxLen > slen) maxLen = slen;
669         for (i = 1; i < maxLen; ++i) {
670             if (text.charAt(start + i) != s.charAt(i)) return 0;
671         }
672     } else {
673         maxLen = start - limit;
674         if (maxLen > slen) maxLen = slen;
675         --slen; // <=> slen = s.length() - 1;
676         for (i = 1; i < maxLen; ++i) {
677             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
678         }
679     }
680     return maxLen;
681 }
682 
683 /**
684  * Implement of UnicodeMatcher
685  */
addMatchSetTo(UnicodeSet & toUnionTo) const686 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
687     toUnionTo.addAll(*this);
688 }
689 
690 /**
691  * Returns the index of the given character within this set, where
692  * the set is ordered by ascending code point.  If the character
693  * is not in this set, return -1.  The inverse of this method is
694  * <code>charAt()</code>.
695  * @return an index from 0..size()-1, or -1
696  */
indexOf(UChar32 c) const697 int32_t UnicodeSet::indexOf(UChar32 c) const {
698     if (c < MIN_VALUE || c > MAX_VALUE) {
699         return -1;
700     }
701     int32_t i = 0;
702     int32_t n = 0;
703     for (;;) {
704         UChar32 start = list[i++];
705         if (c < start) {
706             return -1;
707         }
708         UChar32 limit = list[i++];
709         if (c < limit) {
710             return n + c - start;
711         }
712         n += limit - start;
713     }
714 }
715 
716 /**
717  * Returns the character at the given index within this set, where
718  * the set is ordered by ascending code point.  If the index is
719  * out of range, return (UChar32)-1.  The inverse of this method is
720  * <code>indexOf()</code>.
721  * @param index an index from 0..size()-1
722  * @return the character at the given index, or (UChar32)-1.
723  */
charAt(int32_t index) const724 UChar32 UnicodeSet::charAt(int32_t index) const {
725     if (index >= 0) {
726         // len2 is the largest even integer <= len, that is, it is len
727         // for even values and len-1 for odd values.  With odd values
728         // the last entry is UNICODESET_HIGH.
729         int32_t len2 = len & ~1;
730         for (int32_t i=0; i < len2;) {
731             UChar32 start = list[i++];
732             int32_t count = list[i++] - start;
733             if (index < count) {
734                 return (UChar32)(start + index);
735             }
736             index -= count;
737         }
738     }
739     return (UChar32)-1;
740 }
741 
742 /**
743  * Make this object represent the range <code>start - end</code>.
744  * If <code>end > start</code> then this object is set to an
745  * an empty range.
746  *
747  * @param start first character in the set, inclusive
748  * @rparam end last character in the set, inclusive
749  */
set(UChar32 start,UChar32 end)750 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
751     clear();
752     complement(start, end);
753     return *this;
754 }
755 
756 /**
757  * Adds the specified range to this set if it is not already
758  * present.  If this set already contains the specified range,
759  * the call leaves this set unchanged.  If <code>end > start</code>
760  * then an empty range is added, leaving the set unchanged.
761  *
762  * @param start first character, inclusive, of range to be added
763  * to this set.
764  * @param end last character, inclusive, of range to be added
765  * to this set.
766  */
add(UChar32 start,UChar32 end)767 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
768     if (pinCodePoint(start) < pinCodePoint(end)) {
769         UChar32 limit = end + 1;
770         // Fast path for adding a new range after the last one.
771         // Odd list length: [..., lastStart, lastLimit, HIGH]
772         if ((len & 1) != 0) {
773             // If the list is empty, set lastLimit low enough to not be adjacent to 0.
774             UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
775             if (lastLimit <= start && !isFrozen() && !isBogus()) {
776                 if (lastLimit == start) {
777                     // Extend the last range.
778                     list[len - 2] = limit;
779                     if (limit == UNICODESET_HIGH) {
780                         --len;
781                     }
782                 } else {
783                     list[len - 1] = start;
784                     if (limit < UNICODESET_HIGH) {
785                         if (ensureCapacity(len + 2)) {
786                             list[len++] = limit;
787                             list[len++] = UNICODESET_HIGH;
788                         }
789                     } else {  // limit == UNICODESET_HIGH
790                         if (ensureCapacity(len + 1)) {
791                             list[len++] = UNICODESET_HIGH;
792                         }
793                     }
794                 }
795                 releasePattern();
796                 return *this;
797             }
798         }
799         // This is slow. Could be much faster using findCodePoint(start)
800         // and modifying the list, dealing with adjacent & overlapping ranges.
801         UChar32 range[3] = { start, limit, UNICODESET_HIGH };
802         add(range, 2, 0);
803     } else if (start == end) {
804         add(start);
805     }
806     return *this;
807 }
808 
809 // #define DEBUG_US_ADD
810 
811 #ifdef DEBUG_US_ADD
812 #include <stdio.h>
dump(UChar32 c)813 void dump(UChar32 c) {
814     if (c <= 0xFF) {
815         printf("%c", (char)c);
816     } else {
817         printf("U+%04X", c);
818     }
819 }
dump(const UChar32 * list,int32_t len)820 void dump(const UChar32* list, int32_t len) {
821     printf("[");
822     for (int32_t i=0; i<len; ++i) {
823         if (i != 0) printf(", ");
824         dump(list[i]);
825     }
826     printf("]");
827 }
828 #endif
829 
830 /**
831  * Adds the specified character to this set if it is not already
832  * present.  If this set already contains the specified character,
833  * the call leaves this set unchanged.
834  */
add(UChar32 c)835 UnicodeSet& UnicodeSet::add(UChar32 c) {
836     // find smallest i such that c < list[i]
837     // if odd, then it is IN the set
838     // if even, then it is OUT of the set
839     int32_t i = findCodePoint(pinCodePoint(c));
840 
841     // already in set?
842     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
843 
844     // HIGH is 0x110000
845     // assert(list[len-1] == HIGH);
846 
847     // empty = [HIGH]
848     // [start_0, limit_0, start_1, limit_1, HIGH]
849 
850     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
851     //                             ^
852     //                             list[i]
853 
854     // i == 0 means c is before the first range
855 
856 #ifdef DEBUG_US_ADD
857     printf("Add of ");
858     dump(c);
859     printf(" found at %d", i);
860     printf(": ");
861     dump(list, len);
862     printf(" => ");
863 #endif
864 
865     if (c == list[i]-1) {
866         // c is before start of next range
867         list[i] = c;
868         // if we touched the HIGH mark, then add a new one
869         if (c == (UNICODESET_HIGH - 1)) {
870             if (!ensureCapacity(len+1)) {
871                 // ensureCapacity will mark the object as Bogus if OOM failure happens.
872                 return *this;
873             }
874             list[len++] = UNICODESET_HIGH;
875         }
876         if (i > 0 && c == list[i-1]) {
877             // collapse adjacent ranges
878 
879             // [..., start_k-1, c, c, limit_k, ..., HIGH]
880             //                     ^
881             //                     list[i]
882 
883             //for (int32_t k=i-1; k<len-2; ++k) {
884             //    list[k] = list[k+2];
885             //}
886             UChar32* dst = list + i - 1;
887             UChar32* src = dst + 2;
888             UChar32* srclimit = list + len;
889             while (src < srclimit) *(dst++) = *(src++);
890 
891             len -= 2;
892         }
893     }
894 
895     else if (i > 0 && c == list[i-1]) {
896         // c is after end of prior range
897         list[i-1]++;
898         // no need to check for collapse here
899     }
900 
901     else {
902         // At this point we know the new char is not adjacent to
903         // any existing ranges, and it is not 10FFFF.
904 
905 
906         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
907         //                             ^
908         //                             list[i]
909 
910         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
911         //                             ^
912         //                             list[i]
913 
914         if (!ensureCapacity(len+2)) {
915             // ensureCapacity will mark the object as Bogus if OOM failure happens.
916             return *this;
917         }
918 
919         UChar32 *p = list + i;
920         uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
921         list[i] = c;
922         list[i+1] = c+1;
923         len += 2;
924     }
925 
926 #ifdef DEBUG_US_ADD
927     dump(list, len);
928     printf("\n");
929 
930     for (i=1; i<len; ++i) {
931         if (list[i] <= list[i-1]) {
932             // Corrupt array!
933             printf("ERROR: list has been corrupted\n");
934             exit(1);
935         }
936     }
937 #endif
938 
939     releasePattern();
940     return *this;
941 }
942 
943 /**
944  * Adds the specified multicharacter to this set if it is not already
945  * present.  If this set already contains the multicharacter,
946  * the call leaves this set unchanged.
947  * Thus "ch" => {"ch"}
948  *
949  * @param s the source string
950  * @return the modified set, for chaining
951  */
add(const UnicodeString & s)952 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
953     if (isFrozen() || isBogus()) return *this;
954     int32_t cp = getSingleCP(s);
955     if (cp < 0) {
956         if (!stringsContains(s)) {
957             _add(s);
958             releasePattern();
959         }
960     } else {
961         add((UChar32)cp);
962     }
963     return *this;
964 }
965 
966 /**
967  * Adds the given string, in order, to 'strings'.  The given string
968  * must have been checked by the caller to not already be in 'strings'.
969  */
_add(const UnicodeString & s)970 void UnicodeSet::_add(const UnicodeString& s) {
971     if (isFrozen() || isBogus()) {
972         return;
973     }
974     UErrorCode ec = U_ZERO_ERROR;
975     if (strings == nullptr && !allocateStrings(ec)) {
976         setToBogus();
977         return;
978     }
979     UnicodeString* t = new UnicodeString(s);
980     if (t == nullptr) { // Check for memory allocation error.
981         setToBogus();
982         return;
983     }
984     strings->sortedInsert(t, compareUnicodeString, ec);
985     if (U_FAILURE(ec)) {
986         setToBogus();
987     }
988 }
989 
990 /**
991  * @return a code point IF the string consists of a single one.
992  * otherwise returns -1.
993  * @param string to test
994  */
getSingleCP(const UnicodeString & s)995 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
996     int32_t sLength = s.length();
997     if (sLength == 1) return s.charAt(0);
998     if (sLength == 2) {
999         UChar32 cp = s.char32At(0);
1000         if (cp > 0xFFFF) { // is surrogate pair
1001             return cp;
1002         }
1003     }
1004     return -1;
1005 }
1006 
1007 /**
1008  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1009  * If this set already any particular character, it has no effect on that character.
1010  * @param the source string
1011  * @return the modified set, for chaining
1012  */
addAll(const UnicodeString & s)1013 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1014     UChar32 cp;
1015     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1016         cp = s.char32At(i);
1017         add(cp);
1018     }
1019     return *this;
1020 }
1021 
1022 /**
1023  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1024  * If this set already any particular character, it has no effect on that character.
1025  * @param the source string
1026  * @return the modified set, for chaining
1027  */
retainAll(const UnicodeString & s)1028 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1029     UnicodeSet set;
1030     set.addAll(s);
1031     retainAll(set);
1032     return *this;
1033 }
1034 
1035 /**
1036  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1037  * If this set already any particular character, it has no effect on that character.
1038  * @param the source string
1039  * @return the modified set, for chaining
1040  */
complementAll(const UnicodeString & s)1041 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1042     UnicodeSet set;
1043     set.addAll(s);
1044     complementAll(set);
1045     return *this;
1046 }
1047 
1048 /**
1049  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1050  * If this set already any particular character, it has no effect on that character.
1051  * @param the source string
1052  * @return the modified set, for chaining
1053  */
removeAll(const UnicodeString & s)1054 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1055     UnicodeSet set;
1056     set.addAll(s);
1057     removeAll(set);
1058     return *this;
1059 }
1060 
removeAllStrings()1061 UnicodeSet& UnicodeSet::removeAllStrings() {
1062     if (!isFrozen() && hasStrings()) {
1063         strings->removeAllElements();
1064         releasePattern();
1065     }
1066     return *this;
1067 }
1068 
1069 
1070 /**
1071  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1072  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1073  * @param the source string
1074  * @return a newly created set containing the given string
1075  */
createFrom(const UnicodeString & s)1076 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1077     UnicodeSet *set = new UnicodeSet();
1078     if (set != nullptr) { // Check for memory allocation error.
1079         set->add(s);
1080     }
1081     return set;
1082 }
1083 
1084 
1085 /**
1086  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1087  * @param the source string
1088  * @return a newly created set containing the given characters
1089  */
createFromAll(const UnicodeString & s)1090 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1091     UnicodeSet *set = new UnicodeSet();
1092     if (set != nullptr) { // Check for memory allocation error.
1093         set->addAll(s);
1094     }
1095     return set;
1096 }
1097 
1098 /**
1099  * Retain only the elements in this set that are contained in the
1100  * specified range.  If <code>end > start</code> then an empty range is
1101  * retained, leaving the set empty.
1102  *
1103  * @param start first character, inclusive, of range to be retained
1104  * to this set.
1105  * @param end last character, inclusive, of range to be retained
1106  * to this set.
1107  */
retain(UChar32 start,UChar32 end)1108 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1109     if (pinCodePoint(start) <= pinCodePoint(end)) {
1110         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1111         retain(range, 2, 0);
1112     } else {
1113         clear();
1114     }
1115     return *this;
1116 }
1117 
retain(UChar32 c)1118 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1119     return retain(c, c);
1120 }
1121 
retain(const UnicodeString & s)1122 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1123     if (isFrozen() || isBogus()) { return *this; }
1124     UChar32 cp = getSingleCP(s);
1125     if (cp < 0) {
1126         bool isIn = stringsContains(s);
1127         // Check for getRangeCount() first to avoid somewhat-expensive size()
1128         // when there are single code points.
1129         if (isIn && getRangeCount() == 0 && size() == 1) {
1130             return *this;
1131         }
1132         clear();
1133         if (isIn) {
1134             _add(s);
1135         }
1136     } else {
1137         retain(cp, cp);
1138     }
1139     return *this;
1140 }
1141 
1142 /**
1143  * Removes the specified range from this set if it is present.
1144  * The set will not contain the specified range once the call
1145  * returns.  If <code>end > start</code> then an empty range is
1146  * removed, leaving the set unchanged.
1147  *
1148  * @param start first character, inclusive, of range to be removed
1149  * from this set.
1150  * @param end last character, inclusive, of range to be removed
1151  * from this set.
1152  */
remove(UChar32 start,UChar32 end)1153 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1154     if (pinCodePoint(start) <= pinCodePoint(end)) {
1155         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1156         retain(range, 2, 2);
1157     }
1158     return *this;
1159 }
1160 
1161 /**
1162  * Removes the specified character from this set if it is present.
1163  * The set will not contain the specified range once the call
1164  * returns.
1165  */
remove(UChar32 c)1166 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1167     return remove(c, c);
1168 }
1169 
1170 /**
1171  * Removes the specified string from this set if it is present.
1172  * The set will not contain the specified character once the call
1173  * returns.
1174  * @param the source string
1175  * @return the modified set, for chaining
1176  */
remove(const UnicodeString & s)1177 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1178     if (isFrozen() || isBogus()) return *this;
1179     int32_t cp = getSingleCP(s);
1180     if (cp < 0) {
1181         if (strings != nullptr && strings->removeElement((void*) &s)) {
1182             releasePattern();
1183         }
1184     } else {
1185         remove((UChar32)cp, (UChar32)cp);
1186     }
1187     return *this;
1188 }
1189 
1190 /**
1191  * Complements the specified range in this set.  Any character in
1192  * the range will be removed if it is in this set, or will be
1193  * added if it is not in this set.  If <code>end > start</code>
1194  * then an empty range is xor'ed, leaving the set unchanged.
1195  *
1196  * @param start first character, inclusive, of range to be removed
1197  * from this set.
1198  * @param end last character, inclusive, of range to be removed
1199  * from this set.
1200  */
complement(UChar32 start,UChar32 end)1201 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1202     if (isFrozen() || isBogus()) {
1203         return *this;
1204     }
1205     if (pinCodePoint(start) <= pinCodePoint(end)) {
1206         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1207         exclusiveOr(range, 2, 0);
1208     }
1209     releasePattern();
1210     return *this;
1211 }
1212 
complement(UChar32 c)1213 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1214     return complement(c, c);
1215 }
1216 
1217 /**
1218  * This is equivalent to
1219  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1220  */
complement()1221 UnicodeSet& UnicodeSet::complement() {
1222     if (isFrozen() || isBogus()) {
1223         return *this;
1224     }
1225     if (list[0] == UNICODESET_LOW) {
1226         uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1227         --len;
1228     } else {
1229         if (!ensureCapacity(len+1)) {
1230             return *this;
1231         }
1232         uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1233         list[0] = UNICODESET_LOW;
1234         ++len;
1235     }
1236     releasePattern();
1237     return *this;
1238 }
1239 
1240 /**
1241  * Complement the specified string in this set.
1242  * The set will not contain the specified string once the call
1243  * returns.
1244  *
1245  * @param s the string to complement
1246  * @return this object, for chaining
1247  */
complement(const UnicodeString & s)1248 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1249     if (isFrozen() || isBogus()) return *this;
1250     int32_t cp = getSingleCP(s);
1251     if (cp < 0) {
1252         if (stringsContains(s)) {
1253             strings->removeElement((void*) &s);
1254         } else {
1255             _add(s);
1256         }
1257         releasePattern();
1258     } else {
1259         complement((UChar32)cp, (UChar32)cp);
1260     }
1261     return *this;
1262 }
1263 
1264 /**
1265  * Adds all of the elements in the specified set to this set if
1266  * they're not already present.  This operation effectively
1267  * modifies this set so that its value is the <i>union</i> of the two
1268  * sets.  The behavior of this operation is unspecified if the specified
1269  * collection is modified while the operation is in progress.
1270  *
1271  * @param c set whose elements are to be added to this set.
1272  * @see #add(char, char)
1273  */
addAll(const UnicodeSet & c)1274 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1275     if ( c.len>0 && c.list!=nullptr ) {
1276         add(c.list, c.len, 0);
1277     }
1278 
1279     // Add strings in order
1280     if ( c.strings!=nullptr ) {
1281         for (int32_t i=0; i<c.strings->size(); ++i) {
1282             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1283             if (!stringsContains(*s)) {
1284                 _add(*s);
1285             }
1286         }
1287     }
1288     return *this;
1289 }
1290 
1291 /**
1292  * Retains only the elements in this set that are contained in the
1293  * specified set.  In other words, removes from this set all of
1294  * its elements that are not contained in the specified set.  This
1295  * operation effectively modifies this set so that its value is
1296  * the <i>intersection</i> of the two sets.
1297  *
1298  * @param c set that defines which elements this set will retain.
1299  */
retainAll(const UnicodeSet & c)1300 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1301     if (isFrozen() || isBogus()) {
1302         return *this;
1303     }
1304     retain(c.list, c.len, 0);
1305     if (hasStrings()) {
1306         if (!c.hasStrings()) {
1307             strings->removeAllElements();
1308         } else {
1309             strings->retainAll(*c.strings);
1310         }
1311     }
1312     return *this;
1313 }
1314 
1315 /**
1316  * Removes from this set all of its elements that are contained in the
1317  * specified set.  This operation effectively modifies this
1318  * set so that its value is the <i>asymmetric set difference</i> of
1319  * the two sets.
1320  *
1321  * @param c set that defines which elements will be removed from
1322  *          this set.
1323  */
removeAll(const UnicodeSet & c)1324 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1325     if (isFrozen() || isBogus()) {
1326         return *this;
1327     }
1328     retain(c.list, c.len, 2);
1329     if (hasStrings() && c.hasStrings()) {
1330         strings->removeAll(*c.strings);
1331     }
1332     return *this;
1333 }
1334 
1335 /**
1336  * Complements in this set all elements contained in the specified
1337  * set.  Any character in the other set will be removed if it is
1338  * in this set, or will be added if it is not in this set.
1339  *
1340  * @param c set that defines which elements will be xor'ed from
1341  *          this set.
1342  */
complementAll(const UnicodeSet & c)1343 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1344     if (isFrozen() || isBogus()) {
1345         return *this;
1346     }
1347     exclusiveOr(c.list, c.len, 0);
1348 
1349     if (c.strings != nullptr) {
1350         for (int32_t i=0; i<c.strings->size(); ++i) {
1351             void* e = c.strings->elementAt(i);
1352             if (strings == nullptr || !strings->removeElement(e)) {
1353                 _add(*(const UnicodeString*)e);
1354             }
1355         }
1356     }
1357     return *this;
1358 }
1359 
1360 /**
1361  * Removes all of the elements from this set.  This set will be
1362  * empty after this call returns.
1363  */
clear()1364 UnicodeSet& UnicodeSet::clear() {
1365     if (isFrozen()) {
1366         return *this;
1367     }
1368     list[0] = UNICODESET_HIGH;
1369     len = 1;
1370     releasePattern();
1371     if (strings != nullptr) {
1372         strings->removeAllElements();
1373     }
1374     // Remove bogus
1375     fFlags = 0;
1376     return *this;
1377 }
1378 
1379 /**
1380  * Iteration method that returns the number of ranges contained in
1381  * this set.
1382  * @see #getRangeStart
1383  * @see #getRangeEnd
1384  */
getRangeCount() const1385 int32_t UnicodeSet::getRangeCount() const {
1386     return len/2;
1387 }
1388 
1389 /**
1390  * Iteration method that returns the first character in the
1391  * specified range of this set.
1392  * @see #getRangeCount
1393  * @see #getRangeEnd
1394  */
getRangeStart(int32_t index) const1395 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1396     return list[index*2];
1397 }
1398 
1399 /**
1400  * Iteration method that returns the last character in the
1401  * specified range of this set.
1402  * @see #getRangeStart
1403  * @see #getRangeEnd
1404  */
getRangeEnd(int32_t index) const1405 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1406     return list[index*2 + 1] - 1;
1407 }
1408 
getString(int32_t index) const1409 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1410     return (const UnicodeString*) strings->elementAt(index);
1411 }
1412 
1413 /**
1414  * Reallocate this objects internal structures to take up the least
1415  * possible space, without changing this object's value.
1416  */
compact()1417 UnicodeSet& UnicodeSet::compact() {
1418     if (isFrozen() || isBogus()) {
1419         return *this;
1420     }
1421     // Delete buffer first to defragment memory less.
1422     if (buffer != stackList) {
1423         uprv_free(buffer);
1424         buffer = nullptr;
1425         bufferCapacity = 0;
1426     }
1427     if (list == stackList) {
1428         // pass
1429     } else if (len <= INITIAL_CAPACITY) {
1430         uprv_memcpy(stackList, list, len * sizeof(UChar32));
1431         uprv_free(list);
1432         list = stackList;
1433         capacity = INITIAL_CAPACITY;
1434     } else if ((len + 7) < capacity) {
1435         // If we have more than a little unused capacity, shrink it to len.
1436         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1437         if (temp) {
1438             list = temp;
1439             capacity = len;
1440         }
1441         // else what the heck happened?! We allocated less memory!
1442         // Oh well. We'll keep our original array.
1443     }
1444     if (strings != nullptr && strings->isEmpty()) {
1445         delete strings;
1446         strings = nullptr;
1447     }
1448     return *this;
1449 }
1450 
1451 #ifdef DEBUG_SERIALIZE
1452 #include <stdio.h>
1453 #endif
1454 
1455 /**
1456  * Deserialize constructor.
1457  */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1458 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1459                        UErrorCode &ec) {
1460 
1461   if(U_FAILURE(ec)) {
1462     setToBogus();
1463     return;
1464   }
1465 
1466   if( (serialization != kSerialized)
1467       || (data==nullptr)
1468       || (dataLen < 1)) {
1469     ec = U_ILLEGAL_ARGUMENT_ERROR;
1470     setToBogus();
1471     return;
1472   }
1473 
1474   // bmp?
1475   int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1476   int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1477 
1478   int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1479 #ifdef DEBUG_SERIALIZE
1480   printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1481 #endif
1482   if(!ensureCapacity(newLength + 1)) {  // +1 for HIGH
1483     return;
1484   }
1485   // copy bmp
1486   int32_t i;
1487   for(i = 0; i< bmpLength;i++) {
1488     list[i] = data[i+headerSize];
1489 #ifdef DEBUG_SERIALIZE
1490     printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1491 #endif
1492   }
1493   // copy smp
1494   for(i=bmpLength;i<newLength;i++) {
1495     list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1496               ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1497 #ifdef DEBUG_SERIALIZE
1498     printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1499 #endif
1500   }
1501   U_ASSERT(i == newLength);
1502   if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1503     list[i++] = UNICODESET_HIGH;
1504   }
1505   len = i;
1506 }
1507 
1508 
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1509 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1510     int32_t bmpLength, length, destLength;
1511 
1512     if (U_FAILURE(ec)) {
1513         return 0;
1514     }
1515 
1516     if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
1517         ec=U_ILLEGAL_ARGUMENT_ERROR;
1518         return 0;
1519     }
1520 
1521     /* count necessary 16-bit units */
1522     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1523     // assert(length>=0);
1524     if (length==0) {
1525         /* empty set */
1526         if (destCapacity>0) {
1527             *dest=0;
1528         } else {
1529             ec=U_BUFFER_OVERFLOW_ERROR;
1530         }
1531         return 1;
1532     }
1533     /* now length>0 */
1534 
1535     if (this->list[length-1]<=0xffff) {
1536         /* all BMP */
1537         bmpLength=length;
1538     } else if (this->list[0]>=0x10000) {
1539         /* all supplementary */
1540         bmpLength=0;
1541         length*=2;
1542     } else {
1543         /* some BMP, some supplementary */
1544         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1545         length=bmpLength+2*(length-bmpLength);
1546     }
1547 #ifdef DEBUG_SERIALIZE
1548     printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1549 #endif
1550     /* length: number of 16-bit array units */
1551     if (length>0x7fff) {
1552         /* there are only 15 bits for the length in the first serialized word */
1553         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1554         return 0;
1555     }
1556 
1557     /*
1558      * total serialized length:
1559      * number of 16-bit array units (length) +
1560      * 1 length unit (always) +
1561      * 1 bmpLength unit (if there are supplementary values)
1562      */
1563     destLength=length+((length>bmpLength)?2:1);
1564     if (destLength<=destCapacity) {
1565         const UChar32 *p;
1566         int32_t i;
1567 
1568 #ifdef DEBUG_SERIALIZE
1569         printf("writeHdr\n");
1570 #endif
1571         *dest=(uint16_t)length;
1572         if (length>bmpLength) {
1573             *dest|=0x8000;
1574             *++dest=(uint16_t)bmpLength;
1575         }
1576         ++dest;
1577 
1578         /* write the BMP part of the array */
1579         p=this->list;
1580         for (i=0; i<bmpLength; ++i) {
1581 #ifdef DEBUG_SERIALIZE
1582           printf("writebmp: %x\n", (int)*p);
1583 #endif
1584             *dest++=(uint16_t)*p++;
1585         }
1586 
1587         /* write the supplementary part of the array */
1588         for (; i<length; i+=2) {
1589 #ifdef DEBUG_SERIALIZE
1590           printf("write32: %x\n", (int)*p);
1591 #endif
1592             *dest++=(uint16_t)(*p>>16);
1593             *dest++=(uint16_t)*p++;
1594         }
1595     } else {
1596         ec=U_BUFFER_OVERFLOW_ERROR;
1597     }
1598     return destLength;
1599 }
1600 
1601 //----------------------------------------------------------------
1602 // Implementation: Utility methods
1603 //----------------------------------------------------------------
1604 
1605 /**
1606  * Allocate our strings vector and return true if successful.
1607  */
allocateStrings(UErrorCode & status)1608 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1609     if (U_FAILURE(status)) {
1610         return false;
1611     }
1612     strings = new UVector(uprv_deleteUObject,
1613                           uhash_compareUnicodeString, 1, status);
1614     if (strings == nullptr) { // Check for memory allocation error.
1615         status = U_MEMORY_ALLOCATION_ERROR;
1616         return false;
1617     }
1618     if (U_FAILURE(status)) {
1619         delete strings;
1620         strings = nullptr;
1621         return false;
1622     }
1623     return true;
1624 }
1625 
nextCapacity(int32_t minCapacity)1626 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1627     // Grow exponentially to reduce the frequency of allocations.
1628     if (minCapacity < INITIAL_CAPACITY) {
1629         return minCapacity + INITIAL_CAPACITY;
1630     } else if (minCapacity <= 2500) {
1631         return 5 * minCapacity;
1632     } else {
1633         int32_t newCapacity = 2 * minCapacity;
1634         if (newCapacity > MAX_LENGTH) {
1635             newCapacity = MAX_LENGTH;
1636         }
1637         return newCapacity;
1638     }
1639 }
1640 
ensureCapacity(int32_t newLen)1641 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1642     if (newLen > MAX_LENGTH) {
1643         newLen = MAX_LENGTH;
1644     }
1645     if (newLen <= capacity) {
1646         return true;
1647     }
1648     int32_t newCapacity = nextCapacity(newLen);
1649     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1650     if (temp == nullptr) {
1651         setToBogus(); // set the object to bogus state if an OOM failure occurred.
1652         return false;
1653     }
1654     // Copy only the actual contents.
1655     uprv_memcpy(temp, list, len * sizeof(UChar32));
1656     if (list != stackList) {
1657         uprv_free(list);
1658     }
1659     list = temp;
1660     capacity = newCapacity;
1661     return true;
1662 }
1663 
ensureBufferCapacity(int32_t newLen)1664 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1665     if (newLen > MAX_LENGTH) {
1666         newLen = MAX_LENGTH;
1667     }
1668     if (newLen <= bufferCapacity) {
1669         return true;
1670     }
1671     int32_t newCapacity = nextCapacity(newLen);
1672     UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1673     if (temp == nullptr) {
1674         setToBogus();
1675         return false;
1676     }
1677     // The buffer has no contents to be copied.
1678     // It is always filled from scratch after this call.
1679     if (buffer != stackList) {
1680         uprv_free(buffer);
1681     }
1682     buffer = temp;
1683     bufferCapacity = newCapacity;
1684     return true;
1685 }
1686 
1687 /**
1688  * Swap list and buffer.
1689  */
swapBuffers()1690 void UnicodeSet::swapBuffers() {
1691     // swap list and buffer
1692     UChar32* temp = list;
1693     list = buffer;
1694     buffer = temp;
1695 
1696     int32_t c = capacity;
1697     capacity = bufferCapacity;
1698     bufferCapacity = c;
1699 }
1700 
setToBogus()1701 void UnicodeSet::setToBogus() {
1702     clear(); // Remove everything in the set.
1703     fFlags = kIsBogus;
1704 }
1705 
1706 //----------------------------------------------------------------
1707 // Implementation: Fundamental operators
1708 //----------------------------------------------------------------
1709 
max(UChar32 a,UChar32 b)1710 static inline UChar32 max(UChar32 a, UChar32 b) {
1711     return (a > b) ? a : b;
1712 }
1713 
1714 // polarity = 0, 3 is normal: x xor y
1715 // polarity = 1, 2: x xor ~y == x === y
1716 
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1717 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1718     if (isFrozen() || isBogus()) {
1719         return;
1720     }
1721     if (!ensureBufferCapacity(len + otherLen)) {
1722         return;
1723     }
1724 
1725     int32_t i = 0, j = 0, k = 0;
1726     UChar32 a = list[i++];
1727     UChar32 b;
1728     if (polarity == 1 || polarity == 2) {
1729         b = UNICODESET_LOW;
1730         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1731             ++j;
1732             b = other[j];
1733         }
1734     } else {
1735         b = other[j++];
1736     }
1737     // simplest of all the routines
1738     // sort the values, discarding identicals!
1739     for (;;) {
1740         if (a < b) {
1741             buffer[k++] = a;
1742             a = list[i++];
1743         } else if (b < a) {
1744             buffer[k++] = b;
1745             b = other[j++];
1746         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1747             // discard both values!
1748             a = list[i++];
1749             b = other[j++];
1750         } else { // DONE!
1751             buffer[k++] = UNICODESET_HIGH;
1752             len = k;
1753             break;
1754         }
1755     }
1756     swapBuffers();
1757     releasePattern();
1758 }
1759 
1760 // polarity = 0 is normal: x union y
1761 // polarity = 2: x union ~y
1762 // polarity = 1: ~x union y
1763 // polarity = 3: ~x union ~y
1764 
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1765 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1766     if (isFrozen() || isBogus() || other==nullptr) {
1767         return;
1768     }
1769     if (!ensureBufferCapacity(len + otherLen)) {
1770         return;
1771     }
1772 
1773     int32_t i = 0, j = 0, k = 0;
1774     UChar32 a = list[i++];
1775     UChar32 b = other[j++];
1776     // change from xor is that we have to check overlapping pairs
1777     // polarity bit 1 means a is second, bit 2 means b is.
1778     for (;;) {
1779         switch (polarity) {
1780           case 0: // both first; take lower if unequal
1781             if (a < b) { // take a
1782                 // Back up over overlapping ranges in buffer[]
1783                 if (k > 0 && a <= buffer[k-1]) {
1784                     // Pick latter end value in buffer[] vs. list[]
1785                     a = max(list[i], buffer[--k]);
1786                 } else {
1787                     // No overlap
1788                     buffer[k++] = a;
1789                     a = list[i];
1790                 }
1791                 i++; // Common if/else code factored out
1792                 polarity ^= 1;
1793             } else if (b < a) { // take b
1794                 if (k > 0 && b <= buffer[k-1]) {
1795                     b = max(other[j], buffer[--k]);
1796                 } else {
1797                     buffer[k++] = b;
1798                     b = other[j];
1799                 }
1800                 j++;
1801                 polarity ^= 2;
1802             } else { // a == b, take a, drop b
1803                 if (a == UNICODESET_HIGH) goto loop_end;
1804                 // This is symmetrical; it doesn't matter if
1805                 // we backtrack with a or b. - liu
1806                 if (k > 0 && a <= buffer[k-1]) {
1807                     a = max(list[i], buffer[--k]);
1808                 } else {
1809                     // No overlap
1810                     buffer[k++] = a;
1811                     a = list[i];
1812                 }
1813                 i++;
1814                 polarity ^= 1;
1815                 b = other[j++];
1816                 polarity ^= 2;
1817             }
1818             break;
1819           case 3: // both second; take higher if unequal, and drop other
1820             if (b <= a) { // take a
1821                 if (a == UNICODESET_HIGH) goto loop_end;
1822                 buffer[k++] = a;
1823             } else { // take b
1824                 if (b == UNICODESET_HIGH) goto loop_end;
1825                 buffer[k++] = b;
1826             }
1827             a = list[i++];
1828             polarity ^= 1;   // factored common code
1829             b = other[j++];
1830             polarity ^= 2;
1831             break;
1832           case 1: // a second, b first; if b < a, overlap
1833             if (a < b) { // no overlap, take a
1834                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1835             } else if (b < a) { // OVERLAP, drop b
1836                 b = other[j++];
1837                 polarity ^= 2;
1838             } else { // a == b, drop both!
1839                 if (a == UNICODESET_HIGH) goto loop_end;
1840                 a = list[i++];
1841                 polarity ^= 1;
1842                 b = other[j++];
1843                 polarity ^= 2;
1844             }
1845             break;
1846           case 2: // a first, b second; if a < b, overlap
1847             if (b < a) { // no overlap, take b
1848                 buffer[k++] = b;
1849                 b = other[j++];
1850                 polarity ^= 2;
1851             } else  if (a < b) { // OVERLAP, drop a
1852                 a = list[i++];
1853                 polarity ^= 1;
1854             } else { // a == b, drop both!
1855                 if (a == UNICODESET_HIGH) goto loop_end;
1856                 a = list[i++];
1857                 polarity ^= 1;
1858                 b = other[j++];
1859                 polarity ^= 2;
1860             }
1861             break;
1862         }
1863     }
1864  loop_end:
1865     buffer[k++] = UNICODESET_HIGH;    // terminate
1866     len = k;
1867     swapBuffers();
1868     releasePattern();
1869 }
1870 
1871 // polarity = 0 is normal: x intersect y
1872 // polarity = 2: x intersect ~y == set-minus
1873 // polarity = 1: ~x intersect y
1874 // polarity = 3: ~x intersect ~y
1875 
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1876 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1877     if (isFrozen() || isBogus()) {
1878         return;
1879     }
1880     if (!ensureBufferCapacity(len + otherLen)) {
1881         return;
1882     }
1883 
1884     int32_t i = 0, j = 0, k = 0;
1885     UChar32 a = list[i++];
1886     UChar32 b = other[j++];
1887     // change from xor is that we have to check overlapping pairs
1888     // polarity bit 1 means a is second, bit 2 means b is.
1889     for (;;) {
1890         switch (polarity) {
1891           case 0: // both first; drop the smaller
1892             if (a < b) { // drop a
1893                 a = list[i++];
1894                 polarity ^= 1;
1895             } else if (b < a) { // drop b
1896                 b = other[j++];
1897                 polarity ^= 2;
1898             } else { // a == b, take one, drop other
1899                 if (a == UNICODESET_HIGH) goto loop_end;
1900                 buffer[k++] = a;
1901                 a = list[i++];
1902                 polarity ^= 1;
1903                 b = other[j++];
1904                 polarity ^= 2;
1905             }
1906             break;
1907           case 3: // both second; take lower if unequal
1908             if (a < b) { // take a
1909                 buffer[k++] = a;
1910                 a = list[i++];
1911                 polarity ^= 1;
1912             } else if (b < a) { // take b
1913                 buffer[k++] = b;
1914                 b = other[j++];
1915                 polarity ^= 2;
1916             } else { // a == b, take one, drop other
1917                 if (a == UNICODESET_HIGH) goto loop_end;
1918                 buffer[k++] = a;
1919                 a = list[i++];
1920                 polarity ^= 1;
1921                 b = other[j++];
1922                 polarity ^= 2;
1923             }
1924             break;
1925           case 1: // a second, b first;
1926             if (a < b) { // NO OVERLAP, drop a
1927                 a = list[i++];
1928                 polarity ^= 1;
1929             } else if (b < a) { // OVERLAP, take b
1930                 buffer[k++] = b;
1931                 b = other[j++];
1932                 polarity ^= 2;
1933             } else { // a == b, drop both!
1934                 if (a == UNICODESET_HIGH) goto loop_end;
1935                 a = list[i++];
1936                 polarity ^= 1;
1937                 b = other[j++];
1938                 polarity ^= 2;
1939             }
1940             break;
1941           case 2: // a first, b second; if a < b, overlap
1942             if (b < a) { // no overlap, drop b
1943                 b = other[j++];
1944                 polarity ^= 2;
1945             } else  if (a < b) { // OVERLAP, take a
1946                 buffer[k++] = a;
1947                 a = list[i++];
1948                 polarity ^= 1;
1949             } else { // a == b, drop both!
1950                 if (a == UNICODESET_HIGH) goto loop_end;
1951                 a = list[i++];
1952                 polarity ^= 1;
1953                 b = other[j++];
1954                 polarity ^= 2;
1955             }
1956             break;
1957         }
1958     }
1959  loop_end:
1960     buffer[k++] = UNICODESET_HIGH;    // terminate
1961     len = k;
1962     swapBuffers();
1963     releasePattern();
1964 }
1965 
1966 /**
1967  * Append the <code>toPattern()</code> representation of a
1968  * string to the given <code>StringBuffer</code>.
1969  */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1970 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
1971     UChar32 cp;
1972     for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1973         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1974     }
1975 }
1976 
1977 /**
1978  * Append the <code>toPattern()</code> representation of a
1979  * character to the given <code>StringBuffer</code>.
1980  */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1981 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
1982     if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
1983         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1984         // unprintable
1985         ICU_Utility::escape(buf, c);
1986         return;
1987     }
1988     // Okay to let ':' pass through
1989     switch (c) {
1990     case u'[':
1991     case u']':
1992     case u'-':
1993     case u'^':
1994     case u'&':
1995     case u'\\':
1996     case u'{':
1997     case u'}':
1998     case u':':
1999     case SymbolTable::SYMBOL_REF:
2000         buf.append(u'\\');
2001         break;
2002     default:
2003         // Escape whitespace
2004         if (PatternProps::isWhiteSpace(c)) {
2005             buf.append(u'\\');
2006         }
2007         break;
2008     }
2009     buf.append(c);
2010 }
2011 
_appendToPat(UnicodeString & result,UChar32 start,UChar32 end,UBool escapeUnprintable)2012 void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
2013                               UBool escapeUnprintable) {
2014     _appendToPat(result, start, escapeUnprintable);
2015     if (start != end) {
2016         if ((start+1) != end ||
2017                 // Avoid writing what looks like a lead+trail surrogate pair.
2018                 start == 0xdbff) {
2019             result.append(u'-');
2020         }
2021         _appendToPat(result, end, escapeUnprintable);
2022     }
2023 }
2024 
2025 /**
2026  * Append a string representation of this set to result.  This will be
2027  * a cleaned version of the string passed to applyPattern(), if there
2028  * is one.  Otherwise it will be generated.
2029  */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2030 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2031                                       UBool escapeUnprintable) const
2032 {
2033     if (pat != nullptr) {
2034         int32_t i;
2035         int32_t backslashCount = 0;
2036         for (i=0; i<patLen; ) {
2037             UChar32 c;
2038             U16_NEXT(pat, i, patLen, c);
2039             if (escapeUnprintable ?
2040                     ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
2041                 // If the unprintable character is preceded by an odd
2042                 // number of backslashes, then it has been escaped.
2043                 // Before unescaping it, we delete the final
2044                 // backslash.
2045                 if ((backslashCount % 2) == 1) {
2046                     result.truncate(result.length() - 1);
2047                 }
2048                 ICU_Utility::escape(result, c);
2049                 backslashCount = 0;
2050             } else {
2051                 result.append(c);
2052                 if (c == u'\\') {
2053                     ++backslashCount;
2054                 } else {
2055                     backslashCount = 0;
2056                 }
2057             }
2058         }
2059         return result;
2060     }
2061 
2062     return _generatePattern(result, escapeUnprintable);
2063 }
2064 
2065 /**
2066  * Returns a string representation of this set.  If the result of
2067  * calling this function is passed to a UnicodeSet constructor, it
2068  * will produce another set that is equal to this one.
2069  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2070 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071                                      UBool escapeUnprintable) const
2072 {
2073     result.truncate(0);
2074     return _toPattern(result, escapeUnprintable);
2075 }
2076 
2077 /**
2078  * Generate and append a string representation of this set to result.
2079  * This does not use this.pat, the cleaned up copy of the string
2080  * passed to applyPattern().
2081  */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2082 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083                                             UBool escapeUnprintable) const
2084 {
2085     result.append(u'[');
2086 
2087     int32_t i = 0;
2088     int32_t limit = len & ~1;  // = 2 * getRangeCount()
2089 
2090     // If the set contains at least 2 intervals and includes both
2091     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2092     // be more economical.
2093     //     if (getRangeCount() >= 2 &&
2094     //             getRangeStart(0) == MIN_VALUE &&
2095     //             getRangeEnd(last) == MAX_VALUE)
2096     // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
2097     // If limit == len then len is even and the last range ends with MAX_VALUE.
2098     //
2099     // *But* do not write the inverse (complement) if there are strings.
2100     // Since ICU 70, the '^' performs a code point complement which removes all strings.
2101     if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
2102         // Emit the inverse
2103         result.append(u'^');
2104         // Offsetting the inversion list index by one lets us
2105         // iterate over the ranges of the set complement.
2106         i = 1;
2107         --limit;
2108     }
2109 
2110     // Emit the ranges as pairs.
2111     while (i < limit) {
2112         UChar32 start = list[i];  // getRangeStart()
2113         UChar32 end = list[i + 1] - 1;  // getRangeEnd() = range limit minus one
2114         if (!(0xd800 <= end && end <= 0xdbff)) {
2115             _appendToPat(result, start, end, escapeUnprintable);
2116             i += 2;
2117         } else {
2118             // The range ends with a lead surrogate.
2119             // Avoid writing what looks like a lead+trail surrogate pair.
2120             // 1. Postpone ranges that start with a lead surrogate code point.
2121             int32_t firstLead = i;
2122             while ((i += 2) < limit && list[i] <= 0xdbff) {}
2123             int32_t firstAfterLead = i;
2124             // 2. Write following ranges that start with a trail surrogate code point.
2125             while (i < limit && (start = list[i]) <= 0xdfff) {
2126                 _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
2127                 i += 2;
2128             }
2129             // 3. Now write the postponed ranges.
2130             for (int j = firstLead; j < firstAfterLead; j += 2) {
2131                 _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
2132             }
2133         }
2134     }
2135 
2136     if (strings != nullptr) {
2137         for (int32_t i = 0; i<strings->size(); ++i) {
2138             result.append(u'{');
2139             _appendToPat(result,
2140                          *(const UnicodeString*) strings->elementAt(i),
2141                          escapeUnprintable);
2142             result.append(u'}');
2143         }
2144     }
2145     return result.append(u']');
2146 }
2147 
2148 /**
2149 * Release existing cached pattern
2150 */
releasePattern()2151 void UnicodeSet::releasePattern() {
2152     if (pat) {
2153         uprv_free(pat);
2154         pat = nullptr;
2155         patLen = 0;
2156     }
2157 }
2158 
2159 /**
2160 * Set the new pattern to cache.
2161 */
setPattern(const char16_t * newPat,int32_t newPatLen)2162 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2163     releasePattern();
2164     pat = (char16_t *)uprv_malloc((newPatLen + 1) * sizeof(char16_t));
2165     if (pat) {
2166         patLen = newPatLen;
2167         u_memcpy(pat, newPat, patLen);
2168         pat[patLen] = 0;
2169     }
2170     // else we don't care if malloc failed. This was just a nice cache.
2171     // We can regenerate an equivalent pattern later when requested.
2172 }
2173 
freeze()2174 UnicodeSet *UnicodeSet::freeze() {
2175     if(!isFrozen() && !isBogus()) {
2176         compact();
2177 
2178         // Optimize contains() and span() and similar functions.
2179         if (hasStrings()) {
2180             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2181             if (stringSpan == nullptr) {
2182                 setToBogus();
2183                 return this;
2184             } else if (!stringSpan->needsStringSpanUTF16()) {
2185                 // All strings are irrelevant for span() etc. because
2186                 // all of each string's code points are contained in this set.
2187                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2188                 // many relevant strings as UTF-16.
2189                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2190                 delete stringSpan;
2191                 stringSpan = nullptr;
2192             }
2193         }
2194         if (stringSpan == nullptr) {
2195             // No span-relevant strings: Optimize for code point spans.
2196             bmpSet=new BMPSet(list, len);
2197             if (bmpSet == nullptr) { // Check for memory allocation error.
2198                 setToBogus();
2199             }
2200         }
2201     }
2202     return this;
2203 }
2204 
span(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const2205 int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2206     if(length>0 && bmpSet!=nullptr) {
2207         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2208     }
2209     if(length<0) {
2210         length=u_strlen(s);
2211     }
2212     if(length==0) {
2213         return 0;
2214     }
2215     if(stringSpan!=nullptr) {
2216         return stringSpan->span(s, length, spanCondition);
2217     } else if(hasStrings()) {
2218         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2219                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2220                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2221         UnicodeSetStringSpan strSpan(*this, *strings, which);
2222         if(strSpan.needsStringSpanUTF16()) {
2223             return strSpan.span(s, length, spanCondition);
2224         }
2225     }
2226 
2227     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2228         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2229     }
2230 
2231     UChar32 c;
2232     int32_t start=0, prev=0;
2233     do {
2234         U16_NEXT(s, start, length, c);
2235         if(spanCondition!=contains(c)) {
2236             break;
2237         }
2238     } while((prev=start)<length);
2239     return prev;
2240 }
2241 
spanBack(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const2242 int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2243     if(length>0 && bmpSet!=nullptr) {
2244         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2245     }
2246     if(length<0) {
2247         length=u_strlen(s);
2248     }
2249     if(length==0) {
2250         return 0;
2251     }
2252     if(stringSpan!=nullptr) {
2253         return stringSpan->spanBack(s, length, spanCondition);
2254     } else if(hasStrings()) {
2255         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2256                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2257                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2258         UnicodeSetStringSpan strSpan(*this, *strings, which);
2259         if(strSpan.needsStringSpanUTF16()) {
2260             return strSpan.spanBack(s, length, spanCondition);
2261         }
2262     }
2263 
2264     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2265         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2266     }
2267 
2268     UChar32 c;
2269     int32_t prev=length;
2270     do {
2271         U16_PREV(s, 0, length, c);
2272         if(spanCondition!=contains(c)) {
2273             break;
2274         }
2275     } while((prev=length)>0);
2276     return prev;
2277 }
2278 
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2279 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2280     if(length>0 && bmpSet!=nullptr) {
2281         const uint8_t *s0=(const uint8_t *)s;
2282         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2283     }
2284     if(length<0) {
2285         length=(int32_t)uprv_strlen(s);
2286     }
2287     if(length==0) {
2288         return 0;
2289     }
2290     if(stringSpan!=nullptr) {
2291         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2292     } else if(hasStrings()) {
2293         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2294                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2295                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2296         UnicodeSetStringSpan strSpan(*this, *strings, which);
2297         if(strSpan.needsStringSpanUTF8()) {
2298             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2299         }
2300     }
2301 
2302     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2303         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2304     }
2305 
2306     UChar32 c;
2307     int32_t start=0, prev=0;
2308     do {
2309         U8_NEXT_OR_FFFD(s, start, length, c);
2310         if(spanCondition!=contains(c)) {
2311             break;
2312         }
2313     } while((prev=start)<length);
2314     return prev;
2315 }
2316 
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2317 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2318     if(length>0 && bmpSet!=nullptr) {
2319         const uint8_t *s0=(const uint8_t *)s;
2320         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2321     }
2322     if(length<0) {
2323         length=(int32_t)uprv_strlen(s);
2324     }
2325     if(length==0) {
2326         return 0;
2327     }
2328     if(stringSpan!=nullptr) {
2329         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2330     } else if(hasStrings()) {
2331         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2332                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2333                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2334         UnicodeSetStringSpan strSpan(*this, *strings, which);
2335         if(strSpan.needsStringSpanUTF8()) {
2336             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337         }
2338     }
2339 
2340     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2341         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2342     }
2343 
2344     UChar32 c;
2345     int32_t prev=length;
2346     do {
2347         U8_PREV_OR_FFFD(s, 0, length, c);
2348         if(spanCondition!=contains(c)) {
2349             break;
2350         }
2351     } while((prev=length)>0);
2352     return prev;
2353 }
2354 
2355 U_NAMESPACE_END
2356