• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 **********************************************************************
3 *   Copyright (C) 1999-2009, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 *   Date        Name        Description
7 *   10/20/99    alan        Creation.
8 **********************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/parsepos.h"
14 #include "unicode/symtable.h"
15 #include "ruleiter.h"
16 #include "cmemory.h"
17 #include "cstring.h"
18 #include "uhash.h"
19 #include "util.h"
20 #include "uvector.h"
21 #include "charstr.h"
22 #include "ustrfmt.h"
23 #include "uassert.h"
24 #include "hash.h"
25 #include "bmpset.h"
26 #include "unisetspan.h"
27 
28 // Define UChar constants using hex for EBCDIC compatibility
29 // Used #define to reduce private static exports and memory access time.
30 #define SET_OPEN        ((UChar)0x005B) /*[*/
31 #define SET_CLOSE       ((UChar)0x005D) /*]*/
32 #define HYPHEN          ((UChar)0x002D) /*-*/
33 #define COMPLEMENT      ((UChar)0x005E) /*^*/
34 #define COLON           ((UChar)0x003A) /*:*/
35 #define BACKSLASH       ((UChar)0x005C) /*\*/
36 #define INTERSECTION    ((UChar)0x0026) /*&*/
37 #define UPPER_U         ((UChar)0x0055) /*U*/
38 #define LOWER_U         ((UChar)0x0075) /*u*/
39 #define OPEN_BRACE      ((UChar)123)    /*{*/
40 #define CLOSE_BRACE     ((UChar)125)    /*}*/
41 #define UPPER_P         ((UChar)0x0050) /*P*/
42 #define LOWER_P         ((UChar)0x0070) /*p*/
43 #define UPPER_N         ((UChar)78)     /*N*/
44 #define EQUALS          ((UChar)0x003D) /*=*/
45 
46 // HIGH_VALUE > all valid values. 110000 for codepoints
47 #define UNICODESET_HIGH 0x0110000
48 
49 // LOW <= all valid values. ZERO for codepoints
50 #define UNICODESET_LOW 0x000000
51 
52 // initial storage. Must be >= 0
53 #define START_EXTRA 16
54 
55 // extra amount for growth. Must be >= 0
56 #define GROW_EXTRA START_EXTRA
57 
58 U_NAMESPACE_BEGIN
59 
~SymbolTable()60 SymbolTable::~SymbolTable() {}
61 
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)62 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
63 
64 /**
65  * Modify the given UChar32 variable so that it is in range, by
66  * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
67  * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
68  * It modifies its argument in-place and also returns it.
69  */
70 static inline UChar32 pinCodePoint(UChar32& c) {
71     if (c < UNICODESET_LOW) {
72         c = UNICODESET_LOW;
73     } else if (c > (UNICODESET_HIGH-1)) {
74         c = (UNICODESET_HIGH-1);
75     }
76     return c;
77 }
78 
79 //----------------------------------------------------------------
80 // Debugging
81 //----------------------------------------------------------------
82 
83 // DO NOT DELETE THIS CODE.  This code is used to debug memory leaks.
84 // To enable the debugging, define the symbol DEBUG_MEM in the line
85 // below.  This will result in text being sent to stdout that looks
86 // like this:
87 //   DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
88 //   DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
89 // Each line lists a construction (ct) or destruction (dt) event, the
90 // object address, the number of outstanding objects after the event,
91 // and the pattern of the object in question.
92 
93 // #define DEBUG_MEM
94 
95 #ifdef DEBUG_MEM
96 #include <stdio.h>
97 static int32_t _dbgCount = 0;
98 
_dbgct(UnicodeSet * set)99 static inline void _dbgct(UnicodeSet* set) {
100     UnicodeString str;
101     set->toPattern(str, TRUE);
102     char buf[40];
103     str.extract(0, 39, buf, "");
104     printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
105 }
106 
_dbgdt(UnicodeSet * set)107 static inline void _dbgdt(UnicodeSet* set) {
108     UnicodeString str;
109     set->toPattern(str, TRUE);
110     char buf[40];
111     str.extract(0, 39, buf, "");
112     printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
113 }
114 
115 #else
116 
117 #define _dbgct(set)
118 #define _dbgdt(set)
119 
120 #endif
121 
122 //----------------------------------------------------------------
123 // UnicodeString in UVector support
124 //----------------------------------------------------------------
125 
cloneUnicodeString(UHashTok * dst,UHashTok * src)126 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
127     dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
128 }
129 
compareUnicodeString(UHashTok t1,UHashTok t2)130 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
131     const UnicodeString &a = *(const UnicodeString*)t1.pointer;
132     const UnicodeString &b = *(const UnicodeString*)t2.pointer;
133     return a.compare(b);
134 }
135 
136 //----------------------------------------------------------------
137 // Constructors &c
138 //----------------------------------------------------------------
139 
140 /**
141  * Constructs an empty set.
142  */
UnicodeSet()143 UnicodeSet::UnicodeSet() :
144     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
145     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
146     fFlags(0)
147 {
148     UErrorCode status = U_ZERO_ERROR;
149     allocateStrings(status);
150     if (U_FAILURE(status)) {
151         return;
152     }
153     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
154     if(list!=NULL){
155         list[0] = UNICODESET_HIGH;
156     } else { // If memory allocation failed, set to bogus state.
157         setToBogus();
158         return;
159     }
160     _dbgct(this);
161 }
162 
163 /**
164  * Constructs a set containing the given range. If <code>end >
165  * start</code> then an empty set is created.
166  *
167  * @param start first character, inclusive, of range
168  * @param end last character, inclusive, of range
169  */
UnicodeSet(UChar32 start,UChar32 end)170 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
171     len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
172     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
173     fFlags(0)
174 {
175     UErrorCode status = U_ZERO_ERROR;
176     allocateStrings(status);
177     if (U_FAILURE(status)) {
178         return;
179     }
180     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
181     if(list!=NULL){
182         list[0] = UNICODESET_HIGH;
183         complement(start, end);
184     } else { // If memory allocation failed, set to bogus state.
185         setToBogus();
186         return;
187     }
188     _dbgct(this);
189 }
190 
191 /**
192  * Constructs a set that is identical to the given UnicodeSet.
193  */
UnicodeSet(const UnicodeSet & o)194 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
195     UnicodeFilter(o),
196     len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
197     bmpSet(0),
198     buffer(0), bufferCapacity(0),
199     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
200     fFlags(0)
201 {
202     UErrorCode status = U_ZERO_ERROR;
203     allocateStrings(status);
204     if (U_FAILURE(status)) {
205         return;
206     }
207     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
208     if(list!=NULL){
209         *this = o;
210     } else { // If memory allocation failed, set to bogus state.
211         setToBogus();
212         return;
213     }
214     _dbgct(this);
215 }
216 
217 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)218 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
219     UnicodeFilter(o),
220     len(0), capacity(o.len + GROW_EXTRA), list(0),
221     bmpSet(0),
222     buffer(0), bufferCapacity(0),
223     patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
224     fFlags(0)
225 {
226     UErrorCode status = U_ZERO_ERROR;
227     allocateStrings(status);
228     if (U_FAILURE(status)) {
229         return;
230     }
231     list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
232     if(list!=NULL){
233         // *this = o except for bmpSet and stringSpan
234         len = o.len;
235         uprv_memcpy(list, o.list, len*sizeof(UChar32));
236         if (strings != NULL && o.strings != NULL) {
237             strings->assign(*o.strings, cloneUnicodeString, status);
238         } else { // Invalid strings.
239             setToBogus();
240             return;
241         }
242         if (o.pat) {
243             setPattern(UnicodeString(o.pat, o.patLen));
244         }
245     } else { // If memory allocation failed, set to bogus state.
246         setToBogus();
247         return;
248     }
249     _dbgct(this);
250 }
251 
252 /**
253  * Destructs the set.
254  */
~UnicodeSet()255 UnicodeSet::~UnicodeSet() {
256     _dbgdt(this); // first!
257     uprv_free(list);
258     delete bmpSet;
259     if (buffer) {
260         uprv_free(buffer);
261     }
262     delete strings;
263     delete stringSpan;
264     releasePattern();
265 }
266 
267 /**
268  * Assigns this object to be a copy of another.
269  */
operator =(const UnicodeSet & o)270 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
271     if (this == &o) {
272         return *this;
273     }
274     if (isFrozen()) {
275         return *this;
276     }
277     if (o.isBogus()) {
278         setToBogus();
279         return *this;
280     }
281     UErrorCode ec = U_ZERO_ERROR;
282     ensureCapacity(o.len, ec);
283     if (U_FAILURE(ec)) {
284         return *this; // There is no way to report this error :-(
285     }
286     len = o.len;
287     uprv_memcpy(list, o.list, len*sizeof(UChar32));
288     if (o.bmpSet == NULL) {
289         bmpSet = NULL;
290     } else {
291         bmpSet = new BMPSet(*o.bmpSet, list, len);
292         if (bmpSet == NULL) { // Check for memory allocation error.
293             setToBogus();
294             return *this;
295         }
296     }
297     if (strings != NULL && o.strings != NULL) {
298         strings->assign(*o.strings, cloneUnicodeString, ec);
299     } else { // Invalid strings.
300         setToBogus();
301         return *this;
302     }
303     if (o.stringSpan == NULL) {
304         stringSpan = NULL;
305     } else {
306         stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
307         if (stringSpan == NULL) { // Check for memory allocation error.
308             setToBogus();
309             return *this;
310         }
311     }
312     releasePattern();
313     if (o.pat) {
314         setPattern(UnicodeString(o.pat, o.patLen));
315     }
316     return *this;
317 }
318 
319 /**
320  * Returns a copy of this object.  All UnicodeMatcher objects have
321  * to support cloning in order to allow classes using
322  * UnicodeMatchers, such as Transliterator, to implement cloning.
323  */
clone() const324 UnicodeFunctor* UnicodeSet::clone() const {
325     return new UnicodeSet(*this);
326 }
327 
cloneAsThawed() const328 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
329     return new UnicodeSet(*this, TRUE);
330 }
331 
332 /**
333  * Compares the specified object with this set for equality.  Returns
334  * <tt>true</tt> if the two sets
335  * have the same size, and every member of the specified set is
336  * contained in this set (or equivalently, every member of this set is
337  * contained in the specified set).
338  *
339  * @param o set to be compared for equality with this set.
340  * @return <tt>true</tt> if the specified set is equal to this set.
341  */
operator ==(const UnicodeSet & o) const342 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
343     if (len != o.len) return FALSE;
344     for (int32_t i = 0; i < len; ++i) {
345         if (list[i] != o.list[i]) return FALSE;
346     }
347     if (*strings != *o.strings) return FALSE;
348     return TRUE;
349 }
350 
351 /**
352  * Returns the hash code value for this set.
353  *
354  * @return the hash code value for this set.
355  * @see Object#hashCode()
356  */
hashCode(void) const357 int32_t UnicodeSet::hashCode(void) const {
358     int32_t result = len;
359     for (int32_t i = 0; i < len; ++i) {
360         result *= 1000003;
361         result += list[i];
362     }
363     return result;
364 }
365 
366 //----------------------------------------------------------------
367 // Public API
368 //----------------------------------------------------------------
369 
370 /**
371  * Returns the number of elements in this set (its cardinality),
372  * Note than the elements of a set may include both individual
373  * codepoints and strings.
374  *
375  * @return the number of elements in this set (its cardinality).
376  */
size(void) const377 int32_t UnicodeSet::size(void) const {
378     int32_t n = 0;
379     int32_t count = getRangeCount();
380     for (int32_t i = 0; i < count; ++i) {
381         n += getRangeEnd(i) - getRangeStart(i) + 1;
382     }
383     return n + strings->size();
384 }
385 
386 /**
387  * Returns <tt>true</tt> if this set contains no elements.
388  *
389  * @return <tt>true</tt> if this set contains no elements.
390  */
isEmpty(void) const391 UBool UnicodeSet::isEmpty(void) const {
392     return len == 1 && strings->size() == 0;
393 }
394 
395 /**
396  * Returns true if this set contains the given character.
397  * @param c character to be checked for containment
398  * @return true if the test condition is met
399  */
contains(UChar32 c) const400 UBool UnicodeSet::contains(UChar32 c) const {
401     // Set i to the index of the start item greater than ch
402     // We know we will terminate without length test!
403     // LATER: for large sets, add binary search
404     //int32_t i = -1;
405     //for (;;) {
406     //    if (c < list[++i]) break;
407     //}
408     if (bmpSet != NULL) {
409         return bmpSet->contains(c);
410     }
411     if (stringSpan != NULL) {
412         return stringSpan->contains(c);
413     }
414     if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
415         return FALSE;
416     }
417     int32_t i = findCodePoint(c);
418     return (UBool)(i & 1); // return true if odd
419 }
420 
421 /**
422  * Returns the smallest value i such that c < list[i].  Caller
423  * must ensure that c is a legal value or this method will enter
424  * an infinite loop.  This method performs a binary search.
425  * @param c a character in the range MIN_VALUE..MAX_VALUE
426  * inclusive
427  * @return the smallest integer i in the range 0..len-1,
428  * inclusive, such that c < list[i]
429  */
findCodePoint(UChar32 c) const430 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
431     /* Examples:
432                                        findCodePoint(c)
433        set              list[]         c=0 1 3 4 7 8
434        ===              ==============   ===========
435        []               [110000]         0 0 0 0 0 0
436        [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
437        [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
438        [:Any:]          [0, 110000]      1 1 1 1 1 1
439      */
440 
441     // Return the smallest i such that c < list[i].  Assume
442     // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
443     if (c < list[0])
444         return 0;
445     // High runner test.  c is often after the last range, so an
446     // initial check for this condition pays off.
447     int32_t lo = 0;
448     int32_t hi = len - 1;
449     if (lo >= hi || c >= list[hi-1])
450         return hi;
451     // invariant: c >= list[lo]
452     // invariant: c < list[hi]
453     for (;;) {
454         int32_t i = (lo + hi) >> 1;
455         if (i == lo) {
456             break; // Found!
457         } else if (c < list[i]) {
458             hi = i;
459         } else {
460             lo = i;
461         }
462     }
463     return hi;
464 }
465 
466 /**
467  * Returns true if this set contains every character
468  * of the given range.
469  * @param start first character, inclusive, of the range
470  * @param end last character, inclusive, of the range
471  * @return true if the test condition is met
472  */
contains(UChar32 start,UChar32 end) const473 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
474     //int32_t i = -1;
475     //for (;;) {
476     //    if (start < list[++i]) break;
477     //}
478     int32_t i = findCodePoint(start);
479     return ((i & 1) != 0 && end < list[i]);
480 }
481 
482 /**
483  * Returns <tt>true</tt> if this set contains the given
484  * multicharacter string.
485  * @param s string to be checked for containment
486  * @return <tt>true</tt> if this set contains the specified string
487  */
contains(const UnicodeString & s) const488 UBool UnicodeSet::contains(const UnicodeString& s) const {
489     if (s.length() == 0) return FALSE;
490     int32_t cp = getSingleCP(s);
491     if (cp < 0) {
492         return strings->contains((void*) &s);
493     } else {
494         return contains((UChar32) cp);
495     }
496 }
497 
498 /**
499  * Returns true if this set contains all the characters and strings
500  * of the given set.
501  * @param c set to be checked for containment
502  * @return true if the test condition is met
503  */
containsAll(const UnicodeSet & c) const504 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
505     // The specified set is a subset if all of its pairs are contained in
506     // this set.  It's possible to code this more efficiently in terms of
507     // direct manipulation of the inversion lists if the need arises.
508     int32_t n = c.getRangeCount();
509     for (int i=0; i<n; ++i) {
510         if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
511             return FALSE;
512         }
513     }
514     if (!strings->containsAll(*c.strings)) return FALSE;
515     return TRUE;
516 }
517 
518 /**
519  * Returns true if this set contains all the characters
520  * of the given string.
521  * @param s string containing characters to be checked for containment
522  * @return true if the test condition is met
523  */
containsAll(const UnicodeString & s) const524 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
525     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
526                    s.length());
527 }
528 
529 /**
530  * Returns true if this set contains none of the characters
531  * of the given range.
532  * @param start first character, inclusive, of the range
533  * @param end last character, inclusive, of the range
534  * @return true if the test condition is met
535  */
containsNone(UChar32 start,UChar32 end) const536 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
537     //int32_t i = -1;
538     //for (;;) {
539     //    if (start < list[++i]) break;
540     //}
541     int32_t i = findCodePoint(start);
542     return ((i & 1) == 0 && end < list[i]);
543 }
544 
545 /**
546  * Returns true if this set contains none of the characters and strings
547  * of the given set.
548  * @param c set to be checked for containment
549  * @return true if the test condition is met
550  */
containsNone(const UnicodeSet & c) const551 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
552     // The specified set is a subset if all of its pairs are contained in
553     // this set.  It's possible to code this more efficiently in terms of
554     // direct manipulation of the inversion lists if the need arises.
555     int32_t n = c.getRangeCount();
556     for (int32_t i=0; i<n; ++i) {
557         if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
558             return FALSE;
559         }
560     }
561     if (!strings->containsNone(*c.strings)) return FALSE;
562     return TRUE;
563 }
564 
565 /**
566  * Returns true if this set contains none of the characters
567  * of the given string.
568  * @param s string containing characters to be checked for containment
569  * @return true if the test condition is met
570  */
containsNone(const UnicodeString & s) const571 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
572     return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
573                    s.length());
574 }
575 
576 /**
577  * Returns <tt>true</tt> if this set contains any character whose low byte
578  * is the given value.  This is used by <tt>RuleBasedTransliterator</tt> for
579  * indexing.
580  */
matchesIndexValue(uint8_t v) const581 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
582     /* The index value v, in the range [0,255], is contained in this set if
583      * it is contained in any pair of this set.  Pairs either have the high
584      * bytes equal, or unequal.  If the high bytes are equal, then we have
585      * aaxx..aayy, where aa is the high byte.  Then v is contained if xx <=
586      * v <= yy.  If the high bytes are unequal we have aaxx..bbyy, bb>aa.
587      * Then v is contained if xx <= v || v <= yy.  (This is identical to the
588      * time zone month containment logic.)
589      */
590     int32_t i;
591     int32_t rangeCount=getRangeCount();
592     for (i=0; i<rangeCount; ++i) {
593         UChar32 low = getRangeStart(i);
594         UChar32 high = getRangeEnd(i);
595         if ((low & ~0xFF) == (high & ~0xFF)) {
596             if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
597                 return TRUE;
598             }
599         } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
600             return TRUE;
601         }
602     }
603     if (strings->size() != 0) {
604         for (i=0; i<strings->size(); ++i) {
605             const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
606             //if (s.length() == 0) {
607             //    // Empty strings match everything
608             //    return TRUE;
609             //}
610             // assert(s.length() != 0); // We enforce this elsewhere
611             UChar32 c = s.char32At(0);
612             if ((c & 0xFF) == v) {
613                 return TRUE;
614             }
615         }
616     }
617     return FALSE;
618 }
619 
620 /**
621  * Implementation of UnicodeMatcher::matches().  Always matches the
622  * longest possible multichar string.
623  */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)624 UMatchDegree UnicodeSet::matches(const Replaceable& text,
625                                  int32_t& offset,
626                                  int32_t limit,
627                                  UBool incremental) {
628     if (offset == limit) {
629         // Strings, if any, have length != 0, so we don't worry
630         // about them here.  If we ever allow zero-length strings
631         // we much check for them here.
632         if (contains(U_ETHER)) {
633             return incremental ? U_PARTIAL_MATCH : U_MATCH;
634         } else {
635             return U_MISMATCH;
636         }
637     } else {
638         if (strings->size() != 0) { // try strings first
639 
640             // might separate forward and backward loops later
641             // for now they are combined
642 
643             // TODO Improve efficiency of this, at least in the forward
644             // direction, if not in both.  In the forward direction we
645             // can assume the strings are sorted.
646 
647             int32_t i;
648             UBool forward = offset < limit;
649 
650             // firstChar is the leftmost char to match in the
651             // forward direction or the rightmost char to match in
652             // the reverse direction.
653             UChar firstChar = text.charAt(offset);
654 
655             // If there are multiple strings that can match we
656             // return the longest match.
657             int32_t highWaterLength = 0;
658 
659             for (i=0; i<strings->size(); ++i) {
660                 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
661 
662                 //if (trial.length() == 0) {
663                 //    return U_MATCH; // null-string always matches
664                 //}
665                 // assert(trial.length() != 0); // We ensure this elsewhere
666 
667                 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
668 
669                 // Strings are sorted, so we can optimize in the
670                 // forward direction.
671                 if (forward && c > firstChar) break;
672                 if (c != firstChar) continue;
673 
674                 int32_t matchLen = matchRest(text, offset, limit, trial);
675 
676                 if (incremental) {
677                     int32_t maxLen = forward ? limit-offset : offset-limit;
678                     if (matchLen == maxLen) {
679                         // We have successfully matched but only up to limit.
680                         return U_PARTIAL_MATCH;
681                     }
682                 }
683 
684                 if (matchLen == trial.length()) {
685                     // We have successfully matched the whole string.
686                     if (matchLen > highWaterLength) {
687                         highWaterLength = matchLen;
688                     }
689                     // In the forward direction we know strings
690                     // are sorted so we can bail early.
691                     if (forward && matchLen < highWaterLength) {
692                         break;
693                     }
694                     continue;
695                 }
696             }
697 
698             // We've checked all strings without a partial match.
699             // If we have full matches, return the longest one.
700             if (highWaterLength != 0) {
701                 offset += forward ? highWaterLength : -highWaterLength;
702                 return U_MATCH;
703             }
704         }
705         return UnicodeFilter::matches(text, offset, limit, incremental);
706     }
707 }
708 
709 /**
710  * Returns the longest match for s in text at the given position.
711  * If limit > start then match forward from start+1 to limit
712  * matching all characters except s.charAt(0).  If limit < start,
713  * go backward starting from start-1 matching all characters
714  * except s.charAt(s.length()-1).  This method assumes that the
715  * first character, text.charAt(start), matches s, so it does not
716  * check it.
717  * @param text the text to match
718  * @param start the first character to match.  In the forward
719  * direction, text.charAt(start) is matched against s.charAt(0).
720  * In the reverse direction, it is matched against
721  * s.charAt(s.length()-1).
722  * @param limit the limit offset for matching, either last+1 in
723  * the forward direction, or last-1 in the reverse direction,
724  * where last is the index of the last character to match.
725  * @return If part of s matches up to the limit, return |limit -
726  * start|.  If all of s matches before reaching the limit, return
727  * s.length().  If there is a mismatch between s and text, return
728  * 0
729  */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)730 int32_t UnicodeSet::matchRest(const Replaceable& text,
731                               int32_t start, int32_t limit,
732                               const UnicodeString& s) {
733     int32_t i;
734     int32_t maxLen;
735     int32_t slen = s.length();
736     if (start < limit) {
737         maxLen = limit - start;
738         if (maxLen > slen) maxLen = slen;
739         for (i = 1; i < maxLen; ++i) {
740             if (text.charAt(start + i) != s.charAt(i)) return 0;
741         }
742     } else {
743         maxLen = start - limit;
744         if (maxLen > slen) maxLen = slen;
745         --slen; // <=> slen = s.length() - 1;
746         for (i = 1; i < maxLen; ++i) {
747             if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
748         }
749     }
750     return maxLen;
751 }
752 
753 /**
754  * Implement of UnicodeMatcher
755  */
addMatchSetTo(UnicodeSet & toUnionTo) const756 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
757     toUnionTo.addAll(*this);
758 }
759 
760 /**
761  * Returns the index of the given character within this set, where
762  * the set is ordered by ascending code point.  If the character
763  * is not in this set, return -1.  The inverse of this method is
764  * <code>charAt()</code>.
765  * @return an index from 0..size()-1, or -1
766  */
indexOf(UChar32 c) const767 int32_t UnicodeSet::indexOf(UChar32 c) const {
768     if (c < MIN_VALUE || c > MAX_VALUE) {
769         return -1;
770     }
771     int32_t i = 0;
772     int32_t n = 0;
773     for (;;) {
774         UChar32 start = list[i++];
775         if (c < start) {
776             return -1;
777         }
778         UChar32 limit = list[i++];
779         if (c < limit) {
780             return n + c - start;
781         }
782         n += limit - start;
783     }
784 }
785 
786 /**
787  * Returns the character at the given index within this set, where
788  * the set is ordered by ascending code point.  If the index is
789  * out of range, return (UChar32)-1.  The inverse of this method is
790  * <code>indexOf()</code>.
791  * @param index an index from 0..size()-1
792  * @return the character at the given index, or (UChar32)-1.
793  */
charAt(int32_t index) const794 UChar32 UnicodeSet::charAt(int32_t index) const {
795     if (index >= 0) {
796         // len2 is the largest even integer <= len, that is, it is len
797         // for even values and len-1 for odd values.  With odd values
798         // the last entry is UNICODESET_HIGH.
799         int32_t len2 = len & ~1;
800         for (int32_t i=0; i < len2;) {
801             UChar32 start = list[i++];
802             int32_t count = list[i++] - start;
803             if (index < count) {
804                 return (UChar32)(start + index);
805             }
806             index -= count;
807         }
808     }
809     return (UChar32)-1;
810 }
811 
812 /**
813  * Make this object represent the range <code>start - end</code>.
814  * If <code>end > start</code> then this object is set to an
815  * an empty range.
816  *
817  * @param start first character in the set, inclusive
818  * @rparam end last character in the set, inclusive
819  */
set(UChar32 start,UChar32 end)820 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
821     clear();
822     complement(start, end);
823     return *this;
824 }
825 
826 /**
827  * Adds the specified range to this set if it is not already
828  * present.  If this set already contains the specified range,
829  * the call leaves this set unchanged.  If <code>end > start</code>
830  * then an empty range is added, leaving the set unchanged.
831  *
832  * @param start first character, inclusive, of range to be added
833  * to this set.
834  * @param end last character, inclusive, of range to be added
835  * to this set.
836  */
add(UChar32 start,UChar32 end)837 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
838     if (pinCodePoint(start) < pinCodePoint(end)) {
839         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
840         add(range, 2, 0);
841     } else if (start == end) {
842         add(start);
843     }
844     return *this;
845 }
846 
847 // #define DEBUG_US_ADD
848 
849 #ifdef DEBUG_US_ADD
850 #include <stdio.h>
dump(UChar32 c)851 void dump(UChar32 c) {
852     if (c <= 0xFF) {
853         printf("%c", (char)c);
854     } else {
855         printf("U+%04X", c);
856     }
857 }
dump(const UChar32 * list,int32_t len)858 void dump(const UChar32* list, int32_t len) {
859     printf("[");
860     for (int32_t i=0; i<len; ++i) {
861         if (i != 0) printf(", ");
862         dump(list[i]);
863     }
864     printf("]");
865 }
866 #endif
867 
868 /**
869  * Adds the specified character to this set if it is not already
870  * present.  If this set already contains the specified character,
871  * the call leaves this set unchanged.
872  */
add(UChar32 c)873 UnicodeSet& UnicodeSet::add(UChar32 c) {
874     // find smallest i such that c < list[i]
875     // if odd, then it is IN the set
876     // if even, then it is OUT of the set
877     int32_t i = findCodePoint(pinCodePoint(c));
878 
879     // already in set?
880     if ((i & 1) != 0  || isFrozen() || isBogus()) return *this;
881 
882     // HIGH is 0x110000
883     // assert(list[len-1] == HIGH);
884 
885     // empty = [HIGH]
886     // [start_0, limit_0, start_1, limit_1, HIGH]
887 
888     // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
889     //                             ^
890     //                             list[i]
891 
892     // i == 0 means c is before the first range
893 
894 #ifdef DEBUG_US_ADD
895     printf("Add of ");
896     dump(c);
897     printf(" found at %d", i);
898     printf(": ");
899     dump(list, len);
900     printf(" => ");
901 #endif
902 
903     if (c == list[i]-1) {
904         // c is before start of next range
905         list[i] = c;
906         // if we touched the HIGH mark, then add a new one
907         if (c == (UNICODESET_HIGH - 1)) {
908             UErrorCode status = U_ZERO_ERROR;
909             ensureCapacity(len+1, status);
910             if (U_FAILURE(status)) {
911                 return *this; // There is no way to report this error :-(
912             }
913             list[len++] = UNICODESET_HIGH;
914         }
915         if (i > 0 && c == list[i-1]) {
916             // collapse adjacent ranges
917 
918             // [..., start_k-1, c, c, limit_k, ..., HIGH]
919             //                     ^
920             //                     list[i]
921 
922             //for (int32_t k=i-1; k<len-2; ++k) {
923             //    list[k] = list[k+2];
924             //}
925             UChar32* dst = list + i - 1;
926             UChar32* src = dst + 2;
927             UChar32* srclimit = list + len;
928             while (src < srclimit) *(dst++) = *(src++);
929 
930             len -= 2;
931         }
932     }
933 
934     else if (i > 0 && c == list[i-1]) {
935         // c is after end of prior range
936         list[i-1]++;
937         // no need to check for collapse here
938     }
939 
940     else {
941         // At this point we know the new char is not adjacent to
942         // any existing ranges, and it is not 10FFFF.
943 
944 
945         // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
946         //                             ^
947         //                             list[i]
948 
949         // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
950         //                             ^
951         //                             list[i]
952 
953         UErrorCode status = U_ZERO_ERROR;
954         ensureCapacity(len+2, status);
955         if (U_FAILURE(status)) {
956             return *this; // There is no way to report this error :-(
957         }
958 
959         //for (int32_t k=len-1; k>=i; --k) {
960         //    list[k+2] = list[k];
961         //}
962         UChar32* src = list + len;
963         UChar32* dst = src + 2;
964         UChar32* srclimit = list + i;
965         while (src > srclimit) *(--dst) = *(--src);
966 
967         list[i] = c;
968         list[i+1] = c+1;
969         len += 2;
970     }
971 
972 #ifdef DEBUG_US_ADD
973     dump(list, len);
974     printf("\n");
975 
976     for (i=1; i<len; ++i) {
977         if (list[i] <= list[i-1]) {
978             // Corrupt array!
979             printf("ERROR: list has been corrupted\n");
980             exit(1);
981         }
982     }
983 #endif
984 
985     releasePattern();
986     return *this;
987 }
988 
989 /**
990  * Adds the specified multicharacter to this set if it is not already
991  * present.  If this set already contains the multicharacter,
992  * the call leaves this set unchanged.
993  * Thus "ch" => {"ch"}
994  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
995  * @param s the source string
996  * @return the modified set, for chaining
997  */
add(const UnicodeString & s)998 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
999     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1000     int32_t cp = getSingleCP(s);
1001     if (cp < 0) {
1002         if (!strings->contains((void*) &s)) {
1003             _add(s);
1004             releasePattern();
1005         }
1006     } else {
1007         add((UChar32)cp);
1008     }
1009     return *this;
1010 }
1011 
1012 /**
1013  * Adds the given string, in order, to 'strings'.  The given string
1014  * must have been checked by the caller to not be empty and to not
1015  * already be in 'strings'.
1016  */
_add(const UnicodeString & s)1017 void UnicodeSet::_add(const UnicodeString& s) {
1018     if (isFrozen() || isBogus()) {
1019         return;
1020     }
1021     UnicodeString* t = new UnicodeString(s);
1022     if (t == NULL) { // Check for memory allocation error.
1023         setToBogus();
1024         return;
1025     }
1026     UErrorCode ec = U_ZERO_ERROR;
1027     strings->sortedInsert(t, compareUnicodeString, ec);
1028     if (U_FAILURE(ec)) {
1029         setToBogus();
1030         delete t;
1031     }
1032 }
1033 
1034 /**
1035  * @return a code point IF the string consists of a single one.
1036  * otherwise returns -1.
1037  * @param string to test
1038  */
getSingleCP(const UnicodeString & s)1039 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1040     //if (s.length() < 1) {
1041     //    throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1042     //}
1043     if (s.length() > 2) return -1;
1044     if (s.length() == 1) return s.charAt(0);
1045 
1046     // at this point, len = 2
1047     UChar32 cp = s.char32At(0);
1048     if (cp > 0xFFFF) { // is surrogate pair
1049         return cp;
1050     }
1051     return -1;
1052 }
1053 
1054 /**
1055  * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1056  * If this set already any particular character, it has no effect on that character.
1057  * @param the source string
1058  * @return the modified set, for chaining
1059  */
addAll(const UnicodeString & s)1060 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1061     UChar32 cp;
1062     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1063         cp = s.char32At(i);
1064         add(cp);
1065     }
1066     return *this;
1067 }
1068 
1069 /**
1070  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1071  * If this set already any particular character, it has no effect on that character.
1072  * @param the source string
1073  * @return the modified set, for chaining
1074  */
retainAll(const UnicodeString & s)1075 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1076     UnicodeSet set;
1077     set.addAll(s);
1078     retainAll(set);
1079     return *this;
1080 }
1081 
1082 /**
1083  * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1084  * If this set already any particular character, it has no effect on that character.
1085  * @param the source string
1086  * @return the modified set, for chaining
1087  */
complementAll(const UnicodeString & s)1088 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1089     UnicodeSet set;
1090     set.addAll(s);
1091     complementAll(set);
1092     return *this;
1093 }
1094 
1095 /**
1096  * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1097  * If this set already any particular character, it has no effect on that character.
1098  * @param the source string
1099  * @return the modified set, for chaining
1100  */
removeAll(const UnicodeString & s)1101 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1102     UnicodeSet set;
1103     set.addAll(s);
1104     removeAll(set);
1105     return *this;
1106 }
1107 
removeAllStrings()1108 UnicodeSet& UnicodeSet::removeAllStrings() {
1109     strings->removeAllElements();
1110     return *this;
1111 }
1112 
1113 
1114 /**
1115  * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1116  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1117  * @param the source string
1118  * @return a newly created set containing the given string
1119  */
createFrom(const UnicodeString & s)1120 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1121     UnicodeSet *set = new UnicodeSet();
1122     if (set != NULL) { // Check for memory allocation error.
1123         set->add(s);
1124     }
1125     return set;
1126 }
1127 
1128 
1129 /**
1130  * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1131  * @param the source string
1132  * @return a newly created set containing the given characters
1133  */
createFromAll(const UnicodeString & s)1134 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1135     UnicodeSet *set = new UnicodeSet();
1136     if (set != NULL) { // Check for memory allocation error.
1137         set->addAll(s);
1138     }
1139     return set;
1140 }
1141 
1142 /**
1143  * Retain only the elements in this set that are contained in the
1144  * specified range.  If <code>end > start</code> then an empty range is
1145  * retained, leaving the set empty.
1146  *
1147  * @param start first character, inclusive, of range to be retained
1148  * to this set.
1149  * @param end last character, inclusive, of range to be retained
1150  * to this set.
1151  */
retain(UChar32 start,UChar32 end)1152 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1153     if (pinCodePoint(start) <= pinCodePoint(end)) {
1154         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1155         retain(range, 2, 0);
1156     } else {
1157         clear();
1158     }
1159     return *this;
1160 }
1161 
retain(UChar32 c)1162 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1163     return retain(c, c);
1164 }
1165 
1166 /**
1167  * Removes the specified range from this set if it is present.
1168  * The set will not contain the specified range once the call
1169  * returns.  If <code>end > start</code> then an empty range is
1170  * removed, leaving the set unchanged.
1171  *
1172  * @param start first character, inclusive, of range to be removed
1173  * from this set.
1174  * @param end last character, inclusive, of range to be removed
1175  * from this set.
1176  */
remove(UChar32 start,UChar32 end)1177 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1178     if (pinCodePoint(start) <= pinCodePoint(end)) {
1179         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1180         retain(range, 2, 2);
1181     }
1182     return *this;
1183 }
1184 
1185 /**
1186  * Removes the specified character from this set if it is present.
1187  * The set will not contain the specified range once the call
1188  * returns.
1189  */
remove(UChar32 c)1190 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1191     return remove(c, c);
1192 }
1193 
1194 /**
1195  * Removes the specified string from this set if it is present.
1196  * The set will not contain the specified character once the call
1197  * returns.
1198  * @param the source string
1199  * @return the modified set, for chaining
1200  */
remove(const UnicodeString & s)1201 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1202     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1203     int32_t cp = getSingleCP(s);
1204     if (cp < 0) {
1205         strings->removeElement((void*) &s);
1206         releasePattern();
1207     } else {
1208         remove((UChar32)cp, (UChar32)cp);
1209     }
1210     return *this;
1211 }
1212 
1213 /**
1214  * Complements the specified range in this set.  Any character in
1215  * the range will be removed if it is in this set, or will be
1216  * added if it is not in this set.  If <code>end > start</code>
1217  * then an empty range is xor'ed, leaving the set unchanged.
1218  *
1219  * @param start first character, inclusive, of range to be removed
1220  * from this set.
1221  * @param end last character, inclusive, of range to be removed
1222  * from this set.
1223  */
complement(UChar32 start,UChar32 end)1224 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1225     if (isFrozen() || isBogus()) {
1226         return *this;
1227     }
1228     if (pinCodePoint(start) <= pinCodePoint(end)) {
1229         UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1230         exclusiveOr(range, 2, 0);
1231     }
1232     releasePattern();
1233     return *this;
1234 }
1235 
complement(UChar32 c)1236 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1237     return complement(c, c);
1238 }
1239 
1240 /**
1241  * This is equivalent to
1242  * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1243  */
complement(void)1244 UnicodeSet& UnicodeSet::complement(void) {
1245     if (isFrozen() || isBogus()) {
1246         return *this;
1247     }
1248     UErrorCode status = U_ZERO_ERROR;
1249     if (list[0] == UNICODESET_LOW) {
1250         ensureBufferCapacity(len-1, status);
1251         if (U_FAILURE(status)) {
1252             return *this;
1253         }
1254         uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1255         --len;
1256     } else {
1257         ensureBufferCapacity(len+1, status);
1258         if (U_FAILURE(status)) {
1259             return *this;
1260         }
1261         uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1262         buffer[0] = UNICODESET_LOW;
1263         ++len;
1264     }
1265     swapBuffers();
1266     releasePattern();
1267     return *this;
1268 }
1269 
1270 /**
1271  * Complement the specified string in this set.
1272  * The set will not contain the specified string once the call
1273  * returns.
1274  * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1275  * @param s the string to complement
1276  * @return this object, for chaining
1277  */
complement(const UnicodeString & s)1278 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1279     if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1280     int32_t cp = getSingleCP(s);
1281     if (cp < 0) {
1282         if (strings->contains((void*) &s)) {
1283             strings->removeElement((void*) &s);
1284         } else {
1285             _add(s);
1286         }
1287         releasePattern();
1288     } else {
1289         complement((UChar32)cp, (UChar32)cp);
1290     }
1291     return *this;
1292 }
1293 
1294 /**
1295  * Adds all of the elements in the specified set to this set if
1296  * they're not already present.  This operation effectively
1297  * modifies this set so that its value is the <i>union</i> of the two
1298  * sets.  The behavior of this operation is unspecified if the specified
1299  * collection is modified while the operation is in progress.
1300  *
1301  * @param c set whose elements are to be added to this set.
1302  * @see #add(char, char)
1303  */
addAll(const UnicodeSet & c)1304 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1305     if ( c.len>0 && c.list!=NULL ) {
1306         add(c.list, c.len, 0);
1307     }
1308 
1309     // Add strings in order
1310     if ( c.strings!=NULL ) {
1311         for (int32_t i=0; i<c.strings->size(); ++i) {
1312             const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1313             if (!strings->contains((void*) s)) {
1314                 _add(*s);
1315             }
1316         }
1317     }
1318     return *this;
1319 }
1320 
1321 /**
1322  * Retains only the elements in this set that are contained in the
1323  * specified set.  In other words, removes from this set all of
1324  * its elements that are not contained in the specified set.  This
1325  * operation effectively modifies this set so that its value is
1326  * the <i>intersection</i> of the two sets.
1327  *
1328  * @param c set that defines which elements this set will retain.
1329  */
retainAll(const UnicodeSet & c)1330 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1331     if (isFrozen() || isBogus()) {
1332         return *this;
1333     }
1334     retain(c.list, c.len, 0);
1335     strings->retainAll(*c.strings);
1336     return *this;
1337 }
1338 
1339 /**
1340  * Removes from this set all of its elements that are contained in the
1341  * specified set.  This operation effectively modifies this
1342  * set so that its value is the <i>asymmetric set difference</i> of
1343  * the two sets.
1344  *
1345  * @param c set that defines which elements will be removed from
1346  *          this set.
1347  */
removeAll(const UnicodeSet & c)1348 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1349     if (isFrozen() || isBogus()) {
1350         return *this;
1351     }
1352     retain(c.list, c.len, 2);
1353     strings->removeAll(*c.strings);
1354     return *this;
1355 }
1356 
1357 /**
1358  * Complements in this set all elements contained in the specified
1359  * set.  Any character in the other set will be removed if it is
1360  * in this set, or will be added if it is not in this set.
1361  *
1362  * @param c set that defines which elements will be xor'ed from
1363  *          this set.
1364  */
complementAll(const UnicodeSet & c)1365 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1366     if (isFrozen() || isBogus()) {
1367         return *this;
1368     }
1369     exclusiveOr(c.list, c.len, 0);
1370 
1371     for (int32_t i=0; i<c.strings->size(); ++i) {
1372         void* e = c.strings->elementAt(i);
1373         if (!strings->removeElement(e)) {
1374             _add(*(const UnicodeString*)e);
1375         }
1376     }
1377     return *this;
1378 }
1379 
1380 /**
1381  * Removes all of the elements from this set.  This set will be
1382  * empty after this call returns.
1383  */
clear(void)1384 UnicodeSet& UnicodeSet::clear(void) {
1385     if (isFrozen()) {
1386         return *this;
1387     }
1388     if (list != NULL) {
1389         list[0] = UNICODESET_HIGH;
1390     }
1391     len = 1;
1392     releasePattern();
1393     if (strings != NULL) {
1394         strings->removeAllElements();
1395     }
1396     if (list != NULL && strings != NULL) {
1397         // Remove bogus
1398         fFlags = 0;
1399     }
1400     return *this;
1401 }
1402 
1403 /**
1404  * Iteration method that returns the number of ranges contained in
1405  * this set.
1406  * @see #getRangeStart
1407  * @see #getRangeEnd
1408  */
getRangeCount() const1409 int32_t UnicodeSet::getRangeCount() const {
1410     return len/2;
1411 }
1412 
1413 /**
1414  * Iteration method that returns the first character in the
1415  * specified range of this set.
1416  * @see #getRangeCount
1417  * @see #getRangeEnd
1418  */
getRangeStart(int32_t index) const1419 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1420     return list[index*2];
1421 }
1422 
1423 /**
1424  * Iteration method that returns the last character in the
1425  * specified range of this set.
1426  * @see #getRangeStart
1427  * @see #getRangeEnd
1428  */
getRangeEnd(int32_t index) const1429 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1430     return list[index*2 + 1] - 1;
1431 }
1432 
getStringCount() const1433 int32_t UnicodeSet::getStringCount() const {
1434     return strings->size();
1435 }
1436 
getString(int32_t index) const1437 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1438     return (const UnicodeString*) strings->elementAt(index);
1439 }
1440 
1441 /**
1442  * Reallocate this objects internal structures to take up the least
1443  * possible space, without changing this object's value.
1444  */
compact()1445 UnicodeSet& UnicodeSet::compact() {
1446     if (isFrozen() || isBogus()) {
1447         return *this;
1448     }
1449     // Delete buffer first to defragment memory less.
1450     if (buffer != NULL) {
1451         uprv_free(buffer);
1452         buffer = NULL;
1453     }
1454     if (len < capacity) {
1455         // Make the capacity equal to len or 1.
1456         // We don't want to realloc of 0 size.
1457         int32_t newCapacity = len + (len == 0);
1458         UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1459         if (temp) {
1460             list = temp;
1461             capacity = newCapacity;
1462         }
1463         // else what the heck happened?! We allocated less memory!
1464         // Oh well. We'll keep our original array.
1465     }
1466     return *this;
1467 }
1468 
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1469 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1470     int32_t bmpLength, length, destLength;
1471 
1472     if (U_FAILURE(ec)) {
1473         return 0;
1474     }
1475 
1476     if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1477         ec=U_ILLEGAL_ARGUMENT_ERROR;
1478         return 0;
1479     }
1480 
1481     /* count necessary 16-bit units */
1482     length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1483     // assert(length>=0);
1484     if (length==0) {
1485         /* empty set */
1486         if (destCapacity>0) {
1487             *dest=0;
1488         } else {
1489             ec=U_BUFFER_OVERFLOW_ERROR;
1490         }
1491         return 1;
1492     }
1493     /* now length>0 */
1494 
1495     if (this->list[length-1]<=0xffff) {
1496         /* all BMP */
1497         bmpLength=length;
1498     } else if (this->list[0]>=0x10000) {
1499         /* all supplementary */
1500         bmpLength=0;
1501         length*=2;
1502     } else {
1503         /* some BMP, some supplementary */
1504         for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1505         length=bmpLength+2*(length-bmpLength);
1506     }
1507 
1508     /* length: number of 16-bit array units */
1509     if (length>0x7fff) {
1510         /* there are only 15 bits for the length in the first serialized word */
1511         ec=U_INDEX_OUTOFBOUNDS_ERROR;
1512         return 0;
1513     }
1514 
1515     /*
1516      * total serialized length:
1517      * number of 16-bit array units (length) +
1518      * 1 length unit (always) +
1519      * 1 bmpLength unit (if there are supplementary values)
1520      */
1521     destLength=length+((length>bmpLength)?2:1);
1522     if (destLength<=destCapacity) {
1523         const UChar32 *p;
1524         int32_t i;
1525 
1526         *dest=(uint16_t)length;
1527         if (length>bmpLength) {
1528             *dest|=0x8000;
1529             *++dest=(uint16_t)bmpLength;
1530         }
1531         ++dest;
1532 
1533         /* write the BMP part of the array */
1534         p=this->list;
1535         for (i=0; i<bmpLength; ++i) {
1536             *dest++=(uint16_t)*p++;
1537         }
1538 
1539         /* write the supplementary part of the array */
1540         for (; i<length; i+=2) {
1541             *dest++=(uint16_t)(*p>>16);
1542             *dest++=(uint16_t)*p++;
1543         }
1544     } else {
1545         ec=U_BUFFER_OVERFLOW_ERROR;
1546     }
1547     return destLength;
1548 }
1549 
1550 //----------------------------------------------------------------
1551 // Implementation: Utility methods
1552 //----------------------------------------------------------------
1553 
1554 /**
1555  * Allocate our strings vector and return TRUE if successful.
1556  */
allocateStrings(UErrorCode & status)1557 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1558     if (U_FAILURE(status)) {
1559         return FALSE;
1560     }
1561     strings = new UVector(uhash_deleteUnicodeString,
1562                           uhash_compareUnicodeString, 1, status);
1563     if (strings == NULL) { // Check for memory allocation error.
1564         status = U_MEMORY_ALLOCATION_ERROR;
1565         return FALSE;
1566     }
1567     if (U_FAILURE(status)) {
1568         delete strings;
1569         strings = NULL;
1570         return FALSE;
1571     }
1572     return TRUE;
1573 }
1574 
ensureCapacity(int32_t newLen,UErrorCode & ec)1575 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1576     if (newLen <= capacity)
1577         return;
1578     UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1579     if (temp == NULL) {
1580         ec = U_MEMORY_ALLOCATION_ERROR;
1581         setToBogus();
1582         return;
1583     }
1584     list = temp;
1585     capacity = newLen + GROW_EXTRA;
1586     // else we keep the original contents on the memory failure.
1587 }
1588 
ensureBufferCapacity(int32_t newLen,UErrorCode & ec)1589 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1590     if (buffer != NULL && newLen <= bufferCapacity)
1591         return;
1592     UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1593     if (temp == NULL) {
1594         ec = U_MEMORY_ALLOCATION_ERROR;
1595         setToBogus();
1596         return;
1597     }
1598     buffer = temp;
1599     bufferCapacity = newLen + GROW_EXTRA;
1600     // else we keep the original contents on the memory failure.
1601 }
1602 
1603 /**
1604  * Swap list and buffer.
1605  */
swapBuffers(void)1606 void UnicodeSet::swapBuffers(void) {
1607     // swap list and buffer
1608     UChar32* temp = list;
1609     list = buffer;
1610     buffer = temp;
1611 
1612     int32_t c = capacity;
1613     capacity = bufferCapacity;
1614     bufferCapacity = c;
1615 }
1616 
setToBogus()1617 void UnicodeSet::setToBogus() {
1618     clear(); // Remove everything in the set.
1619     fFlags = kIsBogus;
1620 }
1621 
1622 //----------------------------------------------------------------
1623 // Implementation: Fundamental operators
1624 //----------------------------------------------------------------
1625 
max(UChar32 a,UChar32 b)1626 static inline UChar32 max(UChar32 a, UChar32 b) {
1627     return (a > b) ? a : b;
1628 }
1629 
1630 // polarity = 0, 3 is normal: x xor y
1631 // polarity = 1, 2: x xor ~y == x === y
1632 
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1633 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1634     if (isFrozen() || isBogus()) {
1635         return;
1636     }
1637     UErrorCode status = U_ZERO_ERROR;
1638     ensureBufferCapacity(len + otherLen, status);
1639     if (U_FAILURE(status)) {
1640         return;
1641     }
1642 
1643     int32_t i = 0, j = 0, k = 0;
1644     UChar32 a = list[i++];
1645     UChar32 b;
1646     if (polarity == 1 || polarity == 2) {
1647         b = UNICODESET_LOW;
1648         if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1649             ++j;
1650             b = other[j];
1651         }
1652     } else {
1653         b = other[j++];
1654     }
1655     // simplest of all the routines
1656     // sort the values, discarding identicals!
1657     for (;;) {
1658         if (a < b) {
1659             buffer[k++] = a;
1660             a = list[i++];
1661         } else if (b < a) {
1662             buffer[k++] = b;
1663             b = other[j++];
1664         } else if (a != UNICODESET_HIGH) { // at this point, a == b
1665             // discard both values!
1666             a = list[i++];
1667             b = other[j++];
1668         } else { // DONE!
1669             buffer[k++] = UNICODESET_HIGH;
1670             len = k;
1671             break;
1672         }
1673     }
1674     swapBuffers();
1675     releasePattern();
1676 }
1677 
1678 // polarity = 0 is normal: x union y
1679 // polarity = 2: x union ~y
1680 // polarity = 1: ~x union y
1681 // polarity = 3: ~x union ~y
1682 
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1683 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1684     if (isFrozen() || isBogus() || other==NULL) {
1685         return;
1686     }
1687     UErrorCode status = U_ZERO_ERROR;
1688     ensureBufferCapacity(len + otherLen, status);
1689     if (U_FAILURE(status)) {
1690         return;
1691     }
1692 
1693     int32_t i = 0, j = 0, k = 0;
1694     UChar32 a = list[i++];
1695     UChar32 b = other[j++];
1696     // change from xor is that we have to check overlapping pairs
1697     // polarity bit 1 means a is second, bit 2 means b is.
1698     for (;;) {
1699         switch (polarity) {
1700           case 0: // both first; take lower if unequal
1701             if (a < b) { // take a
1702                 // Back up over overlapping ranges in buffer[]
1703                 if (k > 0 && a <= buffer[k-1]) {
1704                     // Pick latter end value in buffer[] vs. list[]
1705                     a = max(list[i], buffer[--k]);
1706                 } else {
1707                     // No overlap
1708                     buffer[k++] = a;
1709                     a = list[i];
1710                 }
1711                 i++; // Common if/else code factored out
1712                 polarity ^= 1;
1713             } else if (b < a) { // take b
1714                 if (k > 0 && b <= buffer[k-1]) {
1715                     b = max(other[j], buffer[--k]);
1716                 } else {
1717                     buffer[k++] = b;
1718                     b = other[j];
1719                 }
1720                 j++;
1721                 polarity ^= 2;
1722             } else { // a == b, take a, drop b
1723                 if (a == UNICODESET_HIGH) goto loop_end;
1724                 // This is symmetrical; it doesn't matter if
1725                 // we backtrack with a or b. - liu
1726                 if (k > 0 && a <= buffer[k-1]) {
1727                     a = max(list[i], buffer[--k]);
1728                 } else {
1729                     // No overlap
1730                     buffer[k++] = a;
1731                     a = list[i];
1732                 }
1733                 i++;
1734                 polarity ^= 1;
1735                 b = other[j++];
1736                 polarity ^= 2;
1737             }
1738             break;
1739           case 3: // both second; take higher if unequal, and drop other
1740             if (b <= a) { // take a
1741                 if (a == UNICODESET_HIGH) goto loop_end;
1742                 buffer[k++] = a;
1743             } else { // take b
1744                 if (b == UNICODESET_HIGH) goto loop_end;
1745                 buffer[k++] = b;
1746             }
1747             a = list[i++];
1748             polarity ^= 1;   // factored common code
1749             b = other[j++];
1750             polarity ^= 2;
1751             break;
1752           case 1: // a second, b first; if b < a, overlap
1753             if (a < b) { // no overlap, take a
1754                 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1755             } else if (b < a) { // OVERLAP, drop b
1756                 b = other[j++];
1757                 polarity ^= 2;
1758             } else { // a == b, drop both!
1759                 if (a == UNICODESET_HIGH) goto loop_end;
1760                 a = list[i++];
1761                 polarity ^= 1;
1762                 b = other[j++];
1763                 polarity ^= 2;
1764             }
1765             break;
1766           case 2: // a first, b second; if a < b, overlap
1767             if (b < a) { // no overlap, take b
1768                 buffer[k++] = b;
1769                 b = other[j++];
1770                 polarity ^= 2;
1771             } else  if (a < b) { // OVERLAP, drop a
1772                 a = list[i++];
1773                 polarity ^= 1;
1774             } else { // a == b, drop both!
1775                 if (a == UNICODESET_HIGH) goto loop_end;
1776                 a = list[i++];
1777                 polarity ^= 1;
1778                 b = other[j++];
1779                 polarity ^= 2;
1780             }
1781             break;
1782         }
1783     }
1784  loop_end:
1785     buffer[k++] = UNICODESET_HIGH;    // terminate
1786     len = k;
1787     swapBuffers();
1788     releasePattern();
1789 }
1790 
1791 // polarity = 0 is normal: x intersect y
1792 // polarity = 2: x intersect ~y == set-minus
1793 // polarity = 1: ~x intersect y
1794 // polarity = 3: ~x intersect ~y
1795 
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1796 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1797     if (isFrozen() || isBogus()) {
1798         return;
1799     }
1800     UErrorCode status = U_ZERO_ERROR;
1801     ensureBufferCapacity(len + otherLen, status);
1802     if (U_FAILURE(status)) {
1803         return;
1804     }
1805 
1806     int32_t i = 0, j = 0, k = 0;
1807     UChar32 a = list[i++];
1808     UChar32 b = other[j++];
1809     // change from xor is that we have to check overlapping pairs
1810     // polarity bit 1 means a is second, bit 2 means b is.
1811     for (;;) {
1812         switch (polarity) {
1813           case 0: // both first; drop the smaller
1814             if (a < b) { // drop a
1815                 a = list[i++];
1816                 polarity ^= 1;
1817             } else if (b < a) { // drop b
1818                 b = other[j++];
1819                 polarity ^= 2;
1820             } else { // a == b, take one, drop other
1821                 if (a == UNICODESET_HIGH) goto loop_end;
1822                 buffer[k++] = a;
1823                 a = list[i++];
1824                 polarity ^= 1;
1825                 b = other[j++];
1826                 polarity ^= 2;
1827             }
1828             break;
1829           case 3: // both second; take lower if unequal
1830             if (a < b) { // take a
1831                 buffer[k++] = a;
1832                 a = list[i++];
1833                 polarity ^= 1;
1834             } else if (b < a) { // take b
1835                 buffer[k++] = b;
1836                 b = other[j++];
1837                 polarity ^= 2;
1838             } else { // a == b, take one, drop other
1839                 if (a == UNICODESET_HIGH) goto loop_end;
1840                 buffer[k++] = a;
1841                 a = list[i++];
1842                 polarity ^= 1;
1843                 b = other[j++];
1844                 polarity ^= 2;
1845             }
1846             break;
1847           case 1: // a second, b first;
1848             if (a < b) { // NO OVERLAP, drop a
1849                 a = list[i++];
1850                 polarity ^= 1;
1851             } else if (b < a) { // OVERLAP, take b
1852                 buffer[k++] = b;
1853                 b = other[j++];
1854                 polarity ^= 2;
1855             } else { // a == b, drop both!
1856                 if (a == UNICODESET_HIGH) goto loop_end;
1857                 a = list[i++];
1858                 polarity ^= 1;
1859                 b = other[j++];
1860                 polarity ^= 2;
1861             }
1862             break;
1863           case 2: // a first, b second; if a < b, overlap
1864             if (b < a) { // no overlap, drop b
1865                 b = other[j++];
1866                 polarity ^= 2;
1867             } else  if (a < b) { // OVERLAP, take a
1868                 buffer[k++] = a;
1869                 a = list[i++];
1870                 polarity ^= 1;
1871             } else { // a == b, drop both!
1872                 if (a == UNICODESET_HIGH) goto loop_end;
1873                 a = list[i++];
1874                 polarity ^= 1;
1875                 b = other[j++];
1876                 polarity ^= 2;
1877             }
1878             break;
1879         }
1880     }
1881  loop_end:
1882     buffer[k++] = UNICODESET_HIGH;    // terminate
1883     len = k;
1884     swapBuffers();
1885     releasePattern();
1886 }
1887 
1888 /**
1889  * Append the <code>toPattern()</code> representation of a
1890  * string to the given <code>StringBuffer</code>.
1891  */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1892 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1893 escapeUnprintable) {
1894     UChar32 cp;
1895     for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1896         _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1897     }
1898 }
1899 
1900 /**
1901  * Append the <code>toPattern()</code> representation of a
1902  * character to the given <code>StringBuffer</code>.
1903  */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1904 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1905 escapeUnprintable) {
1906     if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1907         // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1908         // unprintable
1909         if (ICU_Utility::escapeUnprintable(buf, c)) {
1910             return;
1911         }
1912     }
1913     // Okay to let ':' pass through
1914     switch (c) {
1915     case SET_OPEN:
1916     case SET_CLOSE:
1917     case HYPHEN:
1918     case COMPLEMENT:
1919     case INTERSECTION:
1920     case BACKSLASH:
1921     case OPEN_BRACE:
1922     case CLOSE_BRACE:
1923     case COLON:
1924     case SymbolTable::SYMBOL_REF:
1925         buf.append(BACKSLASH);
1926         break;
1927     default:
1928         // Escape whitespace
1929         if (uprv_isRuleWhiteSpace(c)) {
1930             buf.append(BACKSLASH);
1931         }
1932         break;
1933     }
1934     buf.append(c);
1935 }
1936 
1937 /**
1938  * Append a string representation of this set to result.  This will be
1939  * a cleaned version of the string passed to applyPattern(), if there
1940  * is one.  Otherwise it will be generated.
1941  */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const1942 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1943                                       UBool escapeUnprintable) const
1944 {
1945     if (pat != NULL) {
1946         int32_t i;
1947         int32_t backslashCount = 0;
1948         for (i=0; i<patLen; ) {
1949             UChar32 c;
1950             U16_NEXT(pat, i, patLen, c);
1951             if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1952                 // If the unprintable character is preceded by an odd
1953                 // number of backslashes, then it has been escaped.
1954                 // Before unescaping it, we delete the final
1955                 // backslash.
1956                 if ((backslashCount % 2) == 1) {
1957                     result.truncate(result.length() - 1);
1958                 }
1959                 ICU_Utility::escapeUnprintable(result, c);
1960                 backslashCount = 0;
1961             } else {
1962                 result.append(c);
1963                 if (c == BACKSLASH) {
1964                     ++backslashCount;
1965                 } else {
1966                     backslashCount = 0;
1967                 }
1968             }
1969         }
1970         return result;
1971     }
1972 
1973     return _generatePattern(result, escapeUnprintable);
1974 }
1975 
1976 /**
1977  * Returns a string representation of this set.  If the result of
1978  * calling this function is passed to a UnicodeSet constructor, it
1979  * will produce another set that is equal to this one.
1980  */
toPattern(UnicodeString & result,UBool escapeUnprintable) const1981 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1982                                      UBool escapeUnprintable) const
1983 {
1984     result.truncate(0);
1985     return _toPattern(result, escapeUnprintable);
1986 }
1987 
1988 /**
1989  * Generate and append a string representation of this set to result.
1990  * This does not use this.pat, the cleaned up copy of the string
1991  * passed to applyPattern().
1992  */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const1993 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1994                                             UBool escapeUnprintable) const
1995 {
1996     result.append(SET_OPEN);
1997 
1998 //  // Check against the predefined categories.  We implicitly build
1999 //  // up ALL category sets the first time toPattern() is called.
2000 //  for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2001 //      if (*this == getCategorySet(cat)) {
2002 //          result.append(COLON);
2003 //          result.append(CATEGORY_NAMES, cat*2, 2);
2004 //          return result.append(CATEGORY_CLOSE);
2005 //      }
2006 //  }
2007 
2008     int32_t count = getRangeCount();
2009 
2010     // If the set contains at least 2 intervals and includes both
2011     // MIN_VALUE and MAX_VALUE, then the inverse representation will
2012     // be more economical.
2013     if (count > 1 &&
2014         getRangeStart(0) == MIN_VALUE &&
2015         getRangeEnd(count-1) == MAX_VALUE) {
2016 
2017         // Emit the inverse
2018         result.append(COMPLEMENT);
2019 
2020         for (int32_t i = 1; i < count; ++i) {
2021             UChar32 start = getRangeEnd(i-1)+1;
2022             UChar32 end = getRangeStart(i)-1;
2023             _appendToPat(result, start, escapeUnprintable);
2024             if (start != end) {
2025                 if ((start+1) != end) {
2026                     result.append(HYPHEN);
2027                 }
2028                 _appendToPat(result, end, escapeUnprintable);
2029             }
2030         }
2031     }
2032 
2033     // Default; emit the ranges as pairs
2034     else {
2035         for (int32_t i = 0; i < count; ++i) {
2036             UChar32 start = getRangeStart(i);
2037             UChar32 end = getRangeEnd(i);
2038             _appendToPat(result, start, escapeUnprintable);
2039             if (start != end) {
2040                 if ((start+1) != end) {
2041                     result.append(HYPHEN);
2042                 }
2043                 _appendToPat(result, end, escapeUnprintable);
2044             }
2045         }
2046     }
2047 
2048     for (int32_t i = 0; i<strings->size(); ++i) {
2049         result.append(OPEN_BRACE);
2050         _appendToPat(result,
2051                      *(const UnicodeString*) strings->elementAt(i),
2052                      escapeUnprintable);
2053         result.append(CLOSE_BRACE);
2054     }
2055     return result.append(SET_CLOSE);
2056 }
2057 
2058 /**
2059 * Release existing cached pattern
2060 */
releasePattern()2061 void UnicodeSet::releasePattern() {
2062     if (pat) {
2063         uprv_free(pat);
2064         pat = NULL;
2065         patLen = 0;
2066     }
2067 }
2068 
2069 /**
2070 * Set the new pattern to cache.
2071 */
setPattern(const UnicodeString & newPat)2072 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2073     releasePattern();
2074     int32_t newPatLen = newPat.length();
2075     pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2076     if (pat) {
2077         patLen = newPatLen;
2078         newPat.extractBetween(0, patLen, pat);
2079         pat[patLen] = 0;
2080     }
2081     // else we don't care if malloc failed. This was just a nice cache.
2082     // We can regenerate an equivalent pattern later when requested.
2083 }
2084 
freeze()2085 UnicodeFunctor *UnicodeSet::freeze() {
2086     if(!isFrozen() && !isBogus()) {
2087         // Do most of what compact() does before freezing because
2088         // compact() will not work when the set is frozen.
2089         // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2090 
2091         // Delete buffer first to defragment memory less.
2092         if (buffer != NULL) {
2093             uprv_free(buffer);
2094             buffer = NULL;
2095         }
2096         if (capacity > (len + GROW_EXTRA)) {
2097             // Make the capacity equal to len or 1.
2098             // We don't want to realloc of 0 size.
2099             capacity = len + (len == 0);
2100             list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2101             if (list == NULL) { // Check for memory allocation error.
2102                 setToBogus();
2103                 return this;
2104             }
2105         }
2106 
2107         // Optimize contains() and span() and similar functions.
2108         if (!strings->isEmpty()) {
2109             stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2110             if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2111                 // All strings are irrelevant for span() etc. because
2112                 // all of each string's code points are contained in this set.
2113                 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2114                 // many relevant strings as UTF-16.
2115                 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2116                 delete stringSpan;
2117                 stringSpan = NULL;
2118             }
2119         }
2120         if (stringSpan == NULL) {
2121             // No span-relevant strings: Optimize for code point spans.
2122             bmpSet=new BMPSet(list, len);
2123             if (bmpSet == NULL) { // Check for memory allocation error.
2124                 setToBogus();
2125             }
2126         }
2127     }
2128     return this;
2129 }
2130 
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2131 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2132     if(length>0 && bmpSet!=NULL) {
2133         return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2134     }
2135     if(length<0) {
2136         length=u_strlen(s);
2137     }
2138     if(length==0) {
2139         return 0;
2140     }
2141     if(stringSpan!=NULL) {
2142         return stringSpan->span(s, length, spanCondition);
2143     } else if(!strings->isEmpty()) {
2144         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2145                             UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2146                             UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2147         UnicodeSetStringSpan strSpan(*this, *strings, which);
2148         if(strSpan.needsStringSpanUTF16()) {
2149             return strSpan.span(s, length, spanCondition);
2150         }
2151     }
2152 
2153     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2154         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2155     }
2156 
2157     UChar32 c;
2158     int32_t start=0, prev=0;
2159     do {
2160         U16_NEXT(s, start, length, c);
2161         if(spanCondition!=contains(c)) {
2162             break;
2163         }
2164     } while((prev=start)<length);
2165     return prev;
2166 }
2167 
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2168 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2169     if(length>0 && bmpSet!=NULL) {
2170         return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2171     }
2172     if(length<0) {
2173         length=u_strlen(s);
2174     }
2175     if(length==0) {
2176         return 0;
2177     }
2178     if(stringSpan!=NULL) {
2179         return stringSpan->spanBack(s, length, spanCondition);
2180     } else if(!strings->isEmpty()) {
2181         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2182                             UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2183                             UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2184         UnicodeSetStringSpan strSpan(*this, *strings, which);
2185         if(strSpan.needsStringSpanUTF16()) {
2186             return strSpan.spanBack(s, length, spanCondition);
2187         }
2188     }
2189 
2190     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2191         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2192     }
2193 
2194     UChar32 c;
2195     int32_t prev=length;
2196     do {
2197         U16_PREV(s, 0, length, c);
2198         if(spanCondition!=contains(c)) {
2199             break;
2200         }
2201     } while((prev=length)>0);
2202     return prev;
2203 }
2204 
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2205 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2206     if(length>0 && bmpSet!=NULL) {
2207         const uint8_t *s0=(const uint8_t *)s;
2208         return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2209     }
2210     if(length<0) {
2211         length=(int32_t)uprv_strlen(s);
2212     }
2213     if(length==0) {
2214         return 0;
2215     }
2216     if(stringSpan!=NULL) {
2217         return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2218     } else if(!strings->isEmpty()) {
2219         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220                             UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2221                             UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2222         UnicodeSetStringSpan strSpan(*this, *strings, which);
2223         if(strSpan.needsStringSpanUTF8()) {
2224             return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2225         }
2226     }
2227 
2228     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2230     }
2231 
2232     UChar32 c;
2233     int32_t start=0, prev=0;
2234     do {
2235         U8_NEXT(s, start, length, c);
2236         if(c<0) {
2237             c=0xfffd;
2238         }
2239         if(spanCondition!=contains(c)) {
2240             break;
2241         }
2242     } while((prev=start)<length);
2243     return prev;
2244 }
2245 
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2246 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2247     if(length>0 && bmpSet!=NULL) {
2248         const uint8_t *s0=(const uint8_t *)s;
2249         return bmpSet->spanBackUTF8(s0, length, spanCondition);
2250     }
2251     if(length<0) {
2252         length=(int32_t)uprv_strlen(s);
2253     }
2254     if(length==0) {
2255         return 0;
2256     }
2257     if(stringSpan!=NULL) {
2258         return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2259     } else if(!strings->isEmpty()) {
2260         uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2261                             UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2262                             UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2263         UnicodeSetStringSpan strSpan(*this, *strings, which);
2264         if(strSpan.needsStringSpanUTF8()) {
2265             return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2266         }
2267     }
2268 
2269     if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2270         spanCondition=USET_SPAN_CONTAINED;  // Pin to 0/1 values.
2271     }
2272 
2273     UChar32 c;
2274     int32_t prev=length;
2275     do {
2276         U8_PREV(s, 0, length, c);
2277         if(c<0) {
2278             c=0xfffd;
2279         }
2280         if(spanCondition!=contains(c)) {
2281             break;
2282         }
2283     } while((prev=length)>0);
2284     return prev;
2285 }
2286 
2287 U_NAMESPACE_END
2288