1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12 #include "unicode/uniset.h"
13 #include "unicode/parsepos.h"
14 #include "unicode/symtable.h"
15 #include "ruleiter.h"
16 #include "cmemory.h"
17 #include "cstring.h"
18 #include "patternprops.h"
19 #include "uhash.h"
20 #include "util.h"
21 #include "uvector.h"
22 #include "charstr.h"
23 #include "ustrfmt.h"
24 #include "uassert.h"
25 #include "hash.h"
26 #include "bmpset.h"
27 #include "unisetspan.h"
28
29 // Define UChar constants using hex for EBCDIC compatibility
30 // Used #define to reduce private static exports and memory access time.
31 #define SET_OPEN ((UChar)0x005B) /*[*/
32 #define SET_CLOSE ((UChar)0x005D) /*]*/
33 #define HYPHEN ((UChar)0x002D) /*-*/
34 #define COMPLEMENT ((UChar)0x005E) /*^*/
35 #define COLON ((UChar)0x003A) /*:*/
36 #define BACKSLASH ((UChar)0x005C) /*\*/
37 #define INTERSECTION ((UChar)0x0026) /*&*/
38 #define UPPER_U ((UChar)0x0055) /*U*/
39 #define LOWER_U ((UChar)0x0075) /*u*/
40 #define OPEN_BRACE ((UChar)123) /*{*/
41 #define CLOSE_BRACE ((UChar)125) /*}*/
42 #define UPPER_P ((UChar)0x0050) /*P*/
43 #define LOWER_P ((UChar)0x0070) /*p*/
44 #define UPPER_N ((UChar)78) /*N*/
45 #define EQUALS ((UChar)0x003D) /*=*/
46
47 // HIGH_VALUE > all valid values. 110000 for codepoints
48 #define UNICODESET_HIGH 0x0110000
49
50 // LOW <= all valid values. ZERO for codepoints
51 #define UNICODESET_LOW 0x000000
52
53 // initial storage. Must be >= 0
54 #define START_EXTRA 16
55
56 // extra amount for growth. Must be >= 0
57 #define GROW_EXTRA START_EXTRA
58
59 U_NAMESPACE_BEGIN
60
~SymbolTable()61 SymbolTable::~SymbolTable() {}
62
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)63 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
64
65 /**
66 * Modify the given UChar32 variable so that it is in range, by
67 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
68 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
69 * It modifies its argument in-place and also returns it.
70 */
71 static inline UChar32 pinCodePoint(UChar32& c) {
72 if (c < UNICODESET_LOW) {
73 c = UNICODESET_LOW;
74 } else if (c > (UNICODESET_HIGH-1)) {
75 c = (UNICODESET_HIGH-1);
76 }
77 return c;
78 }
79
80 //----------------------------------------------------------------
81 // Debugging
82 //----------------------------------------------------------------
83
84 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
85 // To enable the debugging, define the symbol DEBUG_MEM in the line
86 // below. This will result in text being sent to stdout that looks
87 // like this:
88 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
89 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
90 // Each line lists a construction (ct) or destruction (dt) event, the
91 // object address, the number of outstanding objects after the event,
92 // and the pattern of the object in question.
93
94 // #define DEBUG_MEM
95
96 #ifdef DEBUG_MEM
97 #include <stdio.h>
98 static int32_t _dbgCount = 0;
99
_dbgct(UnicodeSet * set)100 static inline void _dbgct(UnicodeSet* set) {
101 UnicodeString str;
102 set->toPattern(str, TRUE);
103 char buf[40];
104 str.extract(0, 39, buf, "");
105 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
106 }
107
_dbgdt(UnicodeSet * set)108 static inline void _dbgdt(UnicodeSet* set) {
109 UnicodeString str;
110 set->toPattern(str, TRUE);
111 char buf[40];
112 str.extract(0, 39, buf, "");
113 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
114 }
115
116 #else
117
118 #define _dbgct(set)
119 #define _dbgdt(set)
120
121 #endif
122
123 //----------------------------------------------------------------
124 // UnicodeString in UVector support
125 //----------------------------------------------------------------
126
cloneUnicodeString(UHashTok * dst,UHashTok * src)127 static void U_CALLCONV cloneUnicodeString(UHashTok *dst, UHashTok *src) {
128 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
129 }
130
compareUnicodeString(UHashTok t1,UHashTok t2)131 static int8_t U_CALLCONV compareUnicodeString(UHashTok t1, UHashTok t2) {
132 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
133 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
134 return a.compare(b);
135 }
136
137 //----------------------------------------------------------------
138 // Constructors &c
139 //----------------------------------------------------------------
140
141 /**
142 * Constructs an empty set.
143 */
UnicodeSet()144 UnicodeSet::UnicodeSet() :
145 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
146 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
147 fFlags(0)
148 {
149 UErrorCode status = U_ZERO_ERROR;
150 allocateStrings(status);
151 if (U_FAILURE(status)) {
152 return;
153 }
154 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
155 if(list!=NULL){
156 list[0] = UNICODESET_HIGH;
157 } else { // If memory allocation failed, set to bogus state.
158 setToBogus();
159 return;
160 }
161 _dbgct(this);
162 }
163
164 /**
165 * Constructs a set containing the given range. If <code>end >
166 * start</code> then an empty set is created.
167 *
168 * @param start first character, inclusive, of range
169 * @param end last character, inclusive, of range
170 */
UnicodeSet(UChar32 start,UChar32 end)171 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
172 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
173 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
174 fFlags(0)
175 {
176 UErrorCode status = U_ZERO_ERROR;
177 allocateStrings(status);
178 if (U_FAILURE(status)) {
179 return;
180 }
181 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
182 if(list!=NULL){
183 list[0] = UNICODESET_HIGH;
184 complement(start, end);
185 } else { // If memory allocation failed, set to bogus state.
186 setToBogus();
187 return;
188 }
189 _dbgct(this);
190 }
191
192 /**
193 * Constructs a set that is identical to the given UnicodeSet.
194 */
UnicodeSet(const UnicodeSet & o)195 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
196 UnicodeFilter(o),
197 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
198 bmpSet(0),
199 buffer(0), bufferCapacity(0),
200 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
201 fFlags(0)
202 {
203 UErrorCode status = U_ZERO_ERROR;
204 allocateStrings(status);
205 if (U_FAILURE(status)) {
206 return;
207 }
208 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
209 if(list!=NULL){
210 *this = o;
211 } else { // If memory allocation failed, set to bogus state.
212 setToBogus();
213 return;
214 }
215 _dbgct(this);
216 }
217
218 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)219 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
220 UnicodeFilter(o),
221 len(0), capacity(o.len + GROW_EXTRA), list(0),
222 bmpSet(0),
223 buffer(0), bufferCapacity(0),
224 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
225 fFlags(0)
226 {
227 UErrorCode status = U_ZERO_ERROR;
228 allocateStrings(status);
229 if (U_FAILURE(status)) {
230 return;
231 }
232 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
233 if(list!=NULL){
234 // *this = o except for bmpSet and stringSpan
235 len = o.len;
236 uprv_memcpy(list, o.list, len*sizeof(UChar32));
237 if (strings != NULL && o.strings != NULL) {
238 strings->assign(*o.strings, cloneUnicodeString, status);
239 } else { // Invalid strings.
240 setToBogus();
241 return;
242 }
243 if (o.pat) {
244 setPattern(UnicodeString(o.pat, o.patLen));
245 }
246 } else { // If memory allocation failed, set to bogus state.
247 setToBogus();
248 return;
249 }
250 _dbgct(this);
251 }
252
253 /**
254 * Destructs the set.
255 */
~UnicodeSet()256 UnicodeSet::~UnicodeSet() {
257 _dbgdt(this); // first!
258 uprv_free(list);
259 delete bmpSet;
260 if (buffer) {
261 uprv_free(buffer);
262 }
263 delete strings;
264 delete stringSpan;
265 releasePattern();
266 }
267
268 /**
269 * Assigns this object to be a copy of another.
270 */
operator =(const UnicodeSet & o)271 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
272 if (this == &o) {
273 return *this;
274 }
275 if (isFrozen()) {
276 return *this;
277 }
278 if (o.isBogus()) {
279 setToBogus();
280 return *this;
281 }
282 UErrorCode ec = U_ZERO_ERROR;
283 ensureCapacity(o.len, ec);
284 if (U_FAILURE(ec)) {
285 return *this; // There is no way to report this error :-(
286 }
287 len = o.len;
288 uprv_memcpy(list, o.list, len*sizeof(UChar32));
289 if (o.bmpSet == NULL) {
290 bmpSet = NULL;
291 } else {
292 bmpSet = new BMPSet(*o.bmpSet, list, len);
293 if (bmpSet == NULL) { // Check for memory allocation error.
294 setToBogus();
295 return *this;
296 }
297 }
298 if (strings != NULL && o.strings != NULL) {
299 strings->assign(*o.strings, cloneUnicodeString, ec);
300 } else { // Invalid strings.
301 setToBogus();
302 return *this;
303 }
304 if (o.stringSpan == NULL) {
305 stringSpan = NULL;
306 } else {
307 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
308 if (stringSpan == NULL) { // Check for memory allocation error.
309 setToBogus();
310 return *this;
311 }
312 }
313 releasePattern();
314 if (o.pat) {
315 setPattern(UnicodeString(o.pat, o.patLen));
316 }
317 return *this;
318 }
319
320 /**
321 * Returns a copy of this object. All UnicodeMatcher objects have
322 * to support cloning in order to allow classes using
323 * UnicodeMatchers, such as Transliterator, to implement cloning.
324 */
clone() const325 UnicodeFunctor* UnicodeSet::clone() const {
326 return new UnicodeSet(*this);
327 }
328
cloneAsThawed() const329 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
330 return new UnicodeSet(*this, TRUE);
331 }
332
333 /**
334 * Compares the specified object with this set for equality. Returns
335 * <tt>true</tt> if the two sets
336 * have the same size, and every member of the specified set is
337 * contained in this set (or equivalently, every member of this set is
338 * contained in the specified set).
339 *
340 * @param o set to be compared for equality with this set.
341 * @return <tt>true</tt> if the specified set is equal to this set.
342 */
operator ==(const UnicodeSet & o) const343 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
344 if (len != o.len) return FALSE;
345 for (int32_t i = 0; i < len; ++i) {
346 if (list[i] != o.list[i]) return FALSE;
347 }
348 if (*strings != *o.strings) return FALSE;
349 return TRUE;
350 }
351
352 /**
353 * Returns the hash code value for this set.
354 *
355 * @return the hash code value for this set.
356 * @see Object#hashCode()
357 */
hashCode(void) const358 int32_t UnicodeSet::hashCode(void) const {
359 int32_t result = len;
360 for (int32_t i = 0; i < len; ++i) {
361 result *= 1000003;
362 result += list[i];
363 }
364 return result;
365 }
366
367 //----------------------------------------------------------------
368 // Public API
369 //----------------------------------------------------------------
370
371 /**
372 * Returns the number of elements in this set (its cardinality),
373 * Note than the elements of a set may include both individual
374 * codepoints and strings.
375 *
376 * @return the number of elements in this set (its cardinality).
377 */
size(void) const378 int32_t UnicodeSet::size(void) const {
379 int32_t n = 0;
380 int32_t count = getRangeCount();
381 for (int32_t i = 0; i < count; ++i) {
382 n += getRangeEnd(i) - getRangeStart(i) + 1;
383 }
384 return n + strings->size();
385 }
386
387 /**
388 * Returns <tt>true</tt> if this set contains no elements.
389 *
390 * @return <tt>true</tt> if this set contains no elements.
391 */
isEmpty(void) const392 UBool UnicodeSet::isEmpty(void) const {
393 return len == 1 && strings->size() == 0;
394 }
395
396 /**
397 * Returns true if this set contains the given character.
398 * @param c character to be checked for containment
399 * @return true if the test condition is met
400 */
contains(UChar32 c) const401 UBool UnicodeSet::contains(UChar32 c) const {
402 // Set i to the index of the start item greater than ch
403 // We know we will terminate without length test!
404 // LATER: for large sets, add binary search
405 //int32_t i = -1;
406 //for (;;) {
407 // if (c < list[++i]) break;
408 //}
409 if (bmpSet != NULL) {
410 return bmpSet->contains(c);
411 }
412 if (stringSpan != NULL) {
413 return stringSpan->contains(c);
414 }
415 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
416 return FALSE;
417 }
418 int32_t i = findCodePoint(c);
419 return (UBool)(i & 1); // return true if odd
420 }
421
422 /**
423 * Returns the smallest value i such that c < list[i]. Caller
424 * must ensure that c is a legal value or this method will enter
425 * an infinite loop. This method performs a binary search.
426 * @param c a character in the range MIN_VALUE..MAX_VALUE
427 * inclusive
428 * @return the smallest integer i in the range 0..len-1,
429 * inclusive, such that c < list[i]
430 */
findCodePoint(UChar32 c) const431 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
432 /* Examples:
433 findCodePoint(c)
434 set list[] c=0 1 3 4 7 8
435 === ============== ===========
436 [] [110000] 0 0 0 0 0 0
437 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
438 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
439 [:Any:] [0, 110000] 1 1 1 1 1 1
440 */
441
442 // Return the smallest i such that c < list[i]. Assume
443 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
444 if (c < list[0])
445 return 0;
446 // High runner test. c is often after the last range, so an
447 // initial check for this condition pays off.
448 int32_t lo = 0;
449 int32_t hi = len - 1;
450 if (lo >= hi || c >= list[hi-1])
451 return hi;
452 // invariant: c >= list[lo]
453 // invariant: c < list[hi]
454 for (;;) {
455 int32_t i = (lo + hi) >> 1;
456 if (i == lo) {
457 break; // Found!
458 } else if (c < list[i]) {
459 hi = i;
460 } else {
461 lo = i;
462 }
463 }
464 return hi;
465 }
466
467 /**
468 * Returns true if this set contains every character
469 * of the given range.
470 * @param start first character, inclusive, of the range
471 * @param end last character, inclusive, of the range
472 * @return true if the test condition is met
473 */
contains(UChar32 start,UChar32 end) const474 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
475 //int32_t i = -1;
476 //for (;;) {
477 // if (start < list[++i]) break;
478 //}
479 int32_t i = findCodePoint(start);
480 return ((i & 1) != 0 && end < list[i]);
481 }
482
483 /**
484 * Returns <tt>true</tt> if this set contains the given
485 * multicharacter string.
486 * @param s string to be checked for containment
487 * @return <tt>true</tt> if this set contains the specified string
488 */
contains(const UnicodeString & s) const489 UBool UnicodeSet::contains(const UnicodeString& s) const {
490 if (s.length() == 0) return FALSE;
491 int32_t cp = getSingleCP(s);
492 if (cp < 0) {
493 return strings->contains((void*) &s);
494 } else {
495 return contains((UChar32) cp);
496 }
497 }
498
499 /**
500 * Returns true if this set contains all the characters and strings
501 * of the given set.
502 * @param c set to be checked for containment
503 * @return true if the test condition is met
504 */
containsAll(const UnicodeSet & c) const505 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
506 // The specified set is a subset if all of its pairs are contained in
507 // this set. It's possible to code this more efficiently in terms of
508 // direct manipulation of the inversion lists if the need arises.
509 int32_t n = c.getRangeCount();
510 for (int i=0; i<n; ++i) {
511 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
512 return FALSE;
513 }
514 }
515 if (!strings->containsAll(*c.strings)) return FALSE;
516 return TRUE;
517 }
518
519 /**
520 * Returns true if this set contains all the characters
521 * of the given string.
522 * @param s string containing characters to be checked for containment
523 * @return true if the test condition is met
524 */
containsAll(const UnicodeString & s) const525 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
526 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
527 s.length());
528 }
529
530 /**
531 * Returns true if this set contains none of the characters
532 * of the given range.
533 * @param start first character, inclusive, of the range
534 * @param end last character, inclusive, of the range
535 * @return true if the test condition is met
536 */
containsNone(UChar32 start,UChar32 end) const537 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
538 //int32_t i = -1;
539 //for (;;) {
540 // if (start < list[++i]) break;
541 //}
542 int32_t i = findCodePoint(start);
543 return ((i & 1) == 0 && end < list[i]);
544 }
545
546 /**
547 * Returns true if this set contains none of the characters and strings
548 * of the given set.
549 * @param c set to be checked for containment
550 * @return true if the test condition is met
551 */
containsNone(const UnicodeSet & c) const552 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
553 // The specified set is a subset if all of its pairs are contained in
554 // this set. It's possible to code this more efficiently in terms of
555 // direct manipulation of the inversion lists if the need arises.
556 int32_t n = c.getRangeCount();
557 for (int32_t i=0; i<n; ++i) {
558 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
559 return FALSE;
560 }
561 }
562 if (!strings->containsNone(*c.strings)) return FALSE;
563 return TRUE;
564 }
565
566 /**
567 * Returns true if this set contains none of the characters
568 * of the given string.
569 * @param s string containing characters to be checked for containment
570 * @return true if the test condition is met
571 */
containsNone(const UnicodeString & s) const572 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
573 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
574 s.length());
575 }
576
577 /**
578 * Returns <tt>true</tt> if this set contains any character whose low byte
579 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
580 * indexing.
581 */
matchesIndexValue(uint8_t v) const582 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
583 /* The index value v, in the range [0,255], is contained in this set if
584 * it is contained in any pair of this set. Pairs either have the high
585 * bytes equal, or unequal. If the high bytes are equal, then we have
586 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
587 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
588 * Then v is contained if xx <= v || v <= yy. (This is identical to the
589 * time zone month containment logic.)
590 */
591 int32_t i;
592 int32_t rangeCount=getRangeCount();
593 for (i=0; i<rangeCount; ++i) {
594 UChar32 low = getRangeStart(i);
595 UChar32 high = getRangeEnd(i);
596 if ((low & ~0xFF) == (high & ~0xFF)) {
597 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
598 return TRUE;
599 }
600 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
601 return TRUE;
602 }
603 }
604 if (strings->size() != 0) {
605 for (i=0; i<strings->size(); ++i) {
606 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
607 //if (s.length() == 0) {
608 // // Empty strings match everything
609 // return TRUE;
610 //}
611 // assert(s.length() != 0); // We enforce this elsewhere
612 UChar32 c = s.char32At(0);
613 if ((c & 0xFF) == v) {
614 return TRUE;
615 }
616 }
617 }
618 return FALSE;
619 }
620
621 /**
622 * Implementation of UnicodeMatcher::matches(). Always matches the
623 * longest possible multichar string.
624 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)625 UMatchDegree UnicodeSet::matches(const Replaceable& text,
626 int32_t& offset,
627 int32_t limit,
628 UBool incremental) {
629 if (offset == limit) {
630 // Strings, if any, have length != 0, so we don't worry
631 // about them here. If we ever allow zero-length strings
632 // we much check for them here.
633 if (contains(U_ETHER)) {
634 return incremental ? U_PARTIAL_MATCH : U_MATCH;
635 } else {
636 return U_MISMATCH;
637 }
638 } else {
639 if (strings->size() != 0) { // try strings first
640
641 // might separate forward and backward loops later
642 // for now they are combined
643
644 // TODO Improve efficiency of this, at least in the forward
645 // direction, if not in both. In the forward direction we
646 // can assume the strings are sorted.
647
648 int32_t i;
649 UBool forward = offset < limit;
650
651 // firstChar is the leftmost char to match in the
652 // forward direction or the rightmost char to match in
653 // the reverse direction.
654 UChar firstChar = text.charAt(offset);
655
656 // If there are multiple strings that can match we
657 // return the longest match.
658 int32_t highWaterLength = 0;
659
660 for (i=0; i<strings->size(); ++i) {
661 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
662
663 //if (trial.length() == 0) {
664 // return U_MATCH; // null-string always matches
665 //}
666 // assert(trial.length() != 0); // We ensure this elsewhere
667
668 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
669
670 // Strings are sorted, so we can optimize in the
671 // forward direction.
672 if (forward && c > firstChar) break;
673 if (c != firstChar) continue;
674
675 int32_t matchLen = matchRest(text, offset, limit, trial);
676
677 if (incremental) {
678 int32_t maxLen = forward ? limit-offset : offset-limit;
679 if (matchLen == maxLen) {
680 // We have successfully matched but only up to limit.
681 return U_PARTIAL_MATCH;
682 }
683 }
684
685 if (matchLen == trial.length()) {
686 // We have successfully matched the whole string.
687 if (matchLen > highWaterLength) {
688 highWaterLength = matchLen;
689 }
690 // In the forward direction we know strings
691 // are sorted so we can bail early.
692 if (forward && matchLen < highWaterLength) {
693 break;
694 }
695 continue;
696 }
697 }
698
699 // We've checked all strings without a partial match.
700 // If we have full matches, return the longest one.
701 if (highWaterLength != 0) {
702 offset += forward ? highWaterLength : -highWaterLength;
703 return U_MATCH;
704 }
705 }
706 return UnicodeFilter::matches(text, offset, limit, incremental);
707 }
708 }
709
710 /**
711 * Returns the longest match for s in text at the given position.
712 * If limit > start then match forward from start+1 to limit
713 * matching all characters except s.charAt(0). If limit < start,
714 * go backward starting from start-1 matching all characters
715 * except s.charAt(s.length()-1). This method assumes that the
716 * first character, text.charAt(start), matches s, so it does not
717 * check it.
718 * @param text the text to match
719 * @param start the first character to match. In the forward
720 * direction, text.charAt(start) is matched against s.charAt(0).
721 * In the reverse direction, it is matched against
722 * s.charAt(s.length()-1).
723 * @param limit the limit offset for matching, either last+1 in
724 * the forward direction, or last-1 in the reverse direction,
725 * where last is the index of the last character to match.
726 * @return If part of s matches up to the limit, return |limit -
727 * start|. If all of s matches before reaching the limit, return
728 * s.length(). If there is a mismatch between s and text, return
729 * 0
730 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)731 int32_t UnicodeSet::matchRest(const Replaceable& text,
732 int32_t start, int32_t limit,
733 const UnicodeString& s) {
734 int32_t i;
735 int32_t maxLen;
736 int32_t slen = s.length();
737 if (start < limit) {
738 maxLen = limit - start;
739 if (maxLen > slen) maxLen = slen;
740 for (i = 1; i < maxLen; ++i) {
741 if (text.charAt(start + i) != s.charAt(i)) return 0;
742 }
743 } else {
744 maxLen = start - limit;
745 if (maxLen > slen) maxLen = slen;
746 --slen; // <=> slen = s.length() - 1;
747 for (i = 1; i < maxLen; ++i) {
748 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
749 }
750 }
751 return maxLen;
752 }
753
754 /**
755 * Implement of UnicodeMatcher
756 */
addMatchSetTo(UnicodeSet & toUnionTo) const757 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
758 toUnionTo.addAll(*this);
759 }
760
761 /**
762 * Returns the index of the given character within this set, where
763 * the set is ordered by ascending code point. If the character
764 * is not in this set, return -1. The inverse of this method is
765 * <code>charAt()</code>.
766 * @return an index from 0..size()-1, or -1
767 */
indexOf(UChar32 c) const768 int32_t UnicodeSet::indexOf(UChar32 c) const {
769 if (c < MIN_VALUE || c > MAX_VALUE) {
770 return -1;
771 }
772 int32_t i = 0;
773 int32_t n = 0;
774 for (;;) {
775 UChar32 start = list[i++];
776 if (c < start) {
777 return -1;
778 }
779 UChar32 limit = list[i++];
780 if (c < limit) {
781 return n + c - start;
782 }
783 n += limit - start;
784 }
785 }
786
787 /**
788 * Returns the character at the given index within this set, where
789 * the set is ordered by ascending code point. If the index is
790 * out of range, return (UChar32)-1. The inverse of this method is
791 * <code>indexOf()</code>.
792 * @param index an index from 0..size()-1
793 * @return the character at the given index, or (UChar32)-1.
794 */
charAt(int32_t index) const795 UChar32 UnicodeSet::charAt(int32_t index) const {
796 if (index >= 0) {
797 // len2 is the largest even integer <= len, that is, it is len
798 // for even values and len-1 for odd values. With odd values
799 // the last entry is UNICODESET_HIGH.
800 int32_t len2 = len & ~1;
801 for (int32_t i=0; i < len2;) {
802 UChar32 start = list[i++];
803 int32_t count = list[i++] - start;
804 if (index < count) {
805 return (UChar32)(start + index);
806 }
807 index -= count;
808 }
809 }
810 return (UChar32)-1;
811 }
812
813 /**
814 * Make this object represent the range <code>start - end</code>.
815 * If <code>end > start</code> then this object is set to an
816 * an empty range.
817 *
818 * @param start first character in the set, inclusive
819 * @rparam end last character in the set, inclusive
820 */
set(UChar32 start,UChar32 end)821 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
822 clear();
823 complement(start, end);
824 return *this;
825 }
826
827 /**
828 * Adds the specified range to this set if it is not already
829 * present. If this set already contains the specified range,
830 * the call leaves this set unchanged. If <code>end > start</code>
831 * then an empty range is added, leaving the set unchanged.
832 *
833 * @param start first character, inclusive, of range to be added
834 * to this set.
835 * @param end last character, inclusive, of range to be added
836 * to this set.
837 */
add(UChar32 start,UChar32 end)838 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
839 if (pinCodePoint(start) < pinCodePoint(end)) {
840 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
841 add(range, 2, 0);
842 } else if (start == end) {
843 add(start);
844 }
845 return *this;
846 }
847
848 // #define DEBUG_US_ADD
849
850 #ifdef DEBUG_US_ADD
851 #include <stdio.h>
dump(UChar32 c)852 void dump(UChar32 c) {
853 if (c <= 0xFF) {
854 printf("%c", (char)c);
855 } else {
856 printf("U+%04X", c);
857 }
858 }
dump(const UChar32 * list,int32_t len)859 void dump(const UChar32* list, int32_t len) {
860 printf("[");
861 for (int32_t i=0; i<len; ++i) {
862 if (i != 0) printf(", ");
863 dump(list[i]);
864 }
865 printf("]");
866 }
867 #endif
868
869 /**
870 * Adds the specified character to this set if it is not already
871 * present. If this set already contains the specified character,
872 * the call leaves this set unchanged.
873 */
add(UChar32 c)874 UnicodeSet& UnicodeSet::add(UChar32 c) {
875 // find smallest i such that c < list[i]
876 // if odd, then it is IN the set
877 // if even, then it is OUT of the set
878 int32_t i = findCodePoint(pinCodePoint(c));
879
880 // already in set?
881 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
882
883 // HIGH is 0x110000
884 // assert(list[len-1] == HIGH);
885
886 // empty = [HIGH]
887 // [start_0, limit_0, start_1, limit_1, HIGH]
888
889 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
890 // ^
891 // list[i]
892
893 // i == 0 means c is before the first range
894
895 #ifdef DEBUG_US_ADD
896 printf("Add of ");
897 dump(c);
898 printf(" found at %d", i);
899 printf(": ");
900 dump(list, len);
901 printf(" => ");
902 #endif
903
904 if (c == list[i]-1) {
905 // c is before start of next range
906 list[i] = c;
907 // if we touched the HIGH mark, then add a new one
908 if (c == (UNICODESET_HIGH - 1)) {
909 UErrorCode status = U_ZERO_ERROR;
910 ensureCapacity(len+1, status);
911 if (U_FAILURE(status)) {
912 return *this; // There is no way to report this error :-(
913 }
914 list[len++] = UNICODESET_HIGH;
915 }
916 if (i > 0 && c == list[i-1]) {
917 // collapse adjacent ranges
918
919 // [..., start_k-1, c, c, limit_k, ..., HIGH]
920 // ^
921 // list[i]
922
923 //for (int32_t k=i-1; k<len-2; ++k) {
924 // list[k] = list[k+2];
925 //}
926 UChar32* dst = list + i - 1;
927 UChar32* src = dst + 2;
928 UChar32* srclimit = list + len;
929 while (src < srclimit) *(dst++) = *(src++);
930
931 len -= 2;
932 }
933 }
934
935 else if (i > 0 && c == list[i-1]) {
936 // c is after end of prior range
937 list[i-1]++;
938 // no need to check for collapse here
939 }
940
941 else {
942 // At this point we know the new char is not adjacent to
943 // any existing ranges, and it is not 10FFFF.
944
945
946 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
947 // ^
948 // list[i]
949
950 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
951 // ^
952 // list[i]
953
954 UErrorCode status = U_ZERO_ERROR;
955 ensureCapacity(len+2, status);
956 if (U_FAILURE(status)) {
957 return *this; // There is no way to report this error :-(
958 }
959
960 //for (int32_t k=len-1; k>=i; --k) {
961 // list[k+2] = list[k];
962 //}
963 UChar32* src = list + len;
964 UChar32* dst = src + 2;
965 UChar32* srclimit = list + i;
966 while (src > srclimit) *(--dst) = *(--src);
967
968 list[i] = c;
969 list[i+1] = c+1;
970 len += 2;
971 }
972
973 #ifdef DEBUG_US_ADD
974 dump(list, len);
975 printf("\n");
976
977 for (i=1; i<len; ++i) {
978 if (list[i] <= list[i-1]) {
979 // Corrupt array!
980 printf("ERROR: list has been corrupted\n");
981 exit(1);
982 }
983 }
984 #endif
985
986 releasePattern();
987 return *this;
988 }
989
990 /**
991 * Adds the specified multicharacter to this set if it is not already
992 * present. If this set already contains the multicharacter,
993 * the call leaves this set unchanged.
994 * Thus "ch" => {"ch"}
995 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
996 * @param s the source string
997 * @return the modified set, for chaining
998 */
add(const UnicodeString & s)999 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1000 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1001 int32_t cp = getSingleCP(s);
1002 if (cp < 0) {
1003 if (!strings->contains((void*) &s)) {
1004 _add(s);
1005 releasePattern();
1006 }
1007 } else {
1008 add((UChar32)cp);
1009 }
1010 return *this;
1011 }
1012
1013 /**
1014 * Adds the given string, in order, to 'strings'. The given string
1015 * must have been checked by the caller to not be empty and to not
1016 * already be in 'strings'.
1017 */
_add(const UnicodeString & s)1018 void UnicodeSet::_add(const UnicodeString& s) {
1019 if (isFrozen() || isBogus()) {
1020 return;
1021 }
1022 UnicodeString* t = new UnicodeString(s);
1023 if (t == NULL) { // Check for memory allocation error.
1024 setToBogus();
1025 return;
1026 }
1027 UErrorCode ec = U_ZERO_ERROR;
1028 strings->sortedInsert(t, compareUnicodeString, ec);
1029 if (U_FAILURE(ec)) {
1030 setToBogus();
1031 delete t;
1032 }
1033 }
1034
1035 /**
1036 * @return a code point IF the string consists of a single one.
1037 * otherwise returns -1.
1038 * @param string to test
1039 */
getSingleCP(const UnicodeString & s)1040 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1041 //if (s.length() < 1) {
1042 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1043 //}
1044 if (s.length() > 2) return -1;
1045 if (s.length() == 1) return s.charAt(0);
1046
1047 // at this point, len = 2
1048 UChar32 cp = s.char32At(0);
1049 if (cp > 0xFFFF) { // is surrogate pair
1050 return cp;
1051 }
1052 return -1;
1053 }
1054
1055 /**
1056 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1057 * If this set already any particular character, it has no effect on that character.
1058 * @param the source string
1059 * @return the modified set, for chaining
1060 */
addAll(const UnicodeString & s)1061 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1062 UChar32 cp;
1063 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1064 cp = s.char32At(i);
1065 add(cp);
1066 }
1067 return *this;
1068 }
1069
1070 /**
1071 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1072 * If this set already any particular character, it has no effect on that character.
1073 * @param the source string
1074 * @return the modified set, for chaining
1075 */
retainAll(const UnicodeString & s)1076 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1077 UnicodeSet set;
1078 set.addAll(s);
1079 retainAll(set);
1080 return *this;
1081 }
1082
1083 /**
1084 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1085 * If this set already any particular character, it has no effect on that character.
1086 * @param the source string
1087 * @return the modified set, for chaining
1088 */
complementAll(const UnicodeString & s)1089 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1090 UnicodeSet set;
1091 set.addAll(s);
1092 complementAll(set);
1093 return *this;
1094 }
1095
1096 /**
1097 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1098 * If this set already any particular character, it has no effect on that character.
1099 * @param the source string
1100 * @return the modified set, for chaining
1101 */
removeAll(const UnicodeString & s)1102 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1103 UnicodeSet set;
1104 set.addAll(s);
1105 removeAll(set);
1106 return *this;
1107 }
1108
removeAllStrings()1109 UnicodeSet& UnicodeSet::removeAllStrings() {
1110 strings->removeAllElements();
1111 return *this;
1112 }
1113
1114
1115 /**
1116 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1117 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1118 * @param the source string
1119 * @return a newly created set containing the given string
1120 */
createFrom(const UnicodeString & s)1121 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1122 UnicodeSet *set = new UnicodeSet();
1123 if (set != NULL) { // Check for memory allocation error.
1124 set->add(s);
1125 }
1126 return set;
1127 }
1128
1129
1130 /**
1131 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1132 * @param the source string
1133 * @return a newly created set containing the given characters
1134 */
createFromAll(const UnicodeString & s)1135 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1136 UnicodeSet *set = new UnicodeSet();
1137 if (set != NULL) { // Check for memory allocation error.
1138 set->addAll(s);
1139 }
1140 return set;
1141 }
1142
1143 /**
1144 * Retain only the elements in this set that are contained in the
1145 * specified range. If <code>end > start</code> then an empty range is
1146 * retained, leaving the set empty.
1147 *
1148 * @param start first character, inclusive, of range to be retained
1149 * to this set.
1150 * @param end last character, inclusive, of range to be retained
1151 * to this set.
1152 */
retain(UChar32 start,UChar32 end)1153 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1154 if (pinCodePoint(start) <= pinCodePoint(end)) {
1155 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1156 retain(range, 2, 0);
1157 } else {
1158 clear();
1159 }
1160 return *this;
1161 }
1162
retain(UChar32 c)1163 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1164 return retain(c, c);
1165 }
1166
1167 /**
1168 * Removes the specified range from this set if it is present.
1169 * The set will not contain the specified range once the call
1170 * returns. If <code>end > start</code> then an empty range is
1171 * removed, leaving the set unchanged.
1172 *
1173 * @param start first character, inclusive, of range to be removed
1174 * from this set.
1175 * @param end last character, inclusive, of range to be removed
1176 * from this set.
1177 */
remove(UChar32 start,UChar32 end)1178 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1179 if (pinCodePoint(start) <= pinCodePoint(end)) {
1180 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1181 retain(range, 2, 2);
1182 }
1183 return *this;
1184 }
1185
1186 /**
1187 * Removes the specified character from this set if it is present.
1188 * The set will not contain the specified range once the call
1189 * returns.
1190 */
remove(UChar32 c)1191 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1192 return remove(c, c);
1193 }
1194
1195 /**
1196 * Removes the specified string from this set if it is present.
1197 * The set will not contain the specified character once the call
1198 * returns.
1199 * @param the source string
1200 * @return the modified set, for chaining
1201 */
remove(const UnicodeString & s)1202 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1203 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1204 int32_t cp = getSingleCP(s);
1205 if (cp < 0) {
1206 strings->removeElement((void*) &s);
1207 releasePattern();
1208 } else {
1209 remove((UChar32)cp, (UChar32)cp);
1210 }
1211 return *this;
1212 }
1213
1214 /**
1215 * Complements the specified range in this set. Any character in
1216 * the range will be removed if it is in this set, or will be
1217 * added if it is not in this set. If <code>end > start</code>
1218 * then an empty range is xor'ed, leaving the set unchanged.
1219 *
1220 * @param start first character, inclusive, of range to be removed
1221 * from this set.
1222 * @param end last character, inclusive, of range to be removed
1223 * from this set.
1224 */
complement(UChar32 start,UChar32 end)1225 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1226 if (isFrozen() || isBogus()) {
1227 return *this;
1228 }
1229 if (pinCodePoint(start) <= pinCodePoint(end)) {
1230 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1231 exclusiveOr(range, 2, 0);
1232 }
1233 releasePattern();
1234 return *this;
1235 }
1236
complement(UChar32 c)1237 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1238 return complement(c, c);
1239 }
1240
1241 /**
1242 * This is equivalent to
1243 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1244 */
complement(void)1245 UnicodeSet& UnicodeSet::complement(void) {
1246 if (isFrozen() || isBogus()) {
1247 return *this;
1248 }
1249 UErrorCode status = U_ZERO_ERROR;
1250 if (list[0] == UNICODESET_LOW) {
1251 ensureBufferCapacity(len-1, status);
1252 if (U_FAILURE(status)) {
1253 return *this;
1254 }
1255 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1256 --len;
1257 } else {
1258 ensureBufferCapacity(len+1, status);
1259 if (U_FAILURE(status)) {
1260 return *this;
1261 }
1262 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1263 buffer[0] = UNICODESET_LOW;
1264 ++len;
1265 }
1266 swapBuffers();
1267 releasePattern();
1268 return *this;
1269 }
1270
1271 /**
1272 * Complement the specified string in this set.
1273 * The set will not contain the specified string once the call
1274 * returns.
1275 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1276 * @param s the string to complement
1277 * @return this object, for chaining
1278 */
complement(const UnicodeString & s)1279 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1280 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1281 int32_t cp = getSingleCP(s);
1282 if (cp < 0) {
1283 if (strings->contains((void*) &s)) {
1284 strings->removeElement((void*) &s);
1285 } else {
1286 _add(s);
1287 }
1288 releasePattern();
1289 } else {
1290 complement((UChar32)cp, (UChar32)cp);
1291 }
1292 return *this;
1293 }
1294
1295 /**
1296 * Adds all of the elements in the specified set to this set if
1297 * they're not already present. This operation effectively
1298 * modifies this set so that its value is the <i>union</i> of the two
1299 * sets. The behavior of this operation is unspecified if the specified
1300 * collection is modified while the operation is in progress.
1301 *
1302 * @param c set whose elements are to be added to this set.
1303 * @see #add(char, char)
1304 */
addAll(const UnicodeSet & c)1305 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1306 if ( c.len>0 && c.list!=NULL ) {
1307 add(c.list, c.len, 0);
1308 }
1309
1310 // Add strings in order
1311 if ( c.strings!=NULL ) {
1312 for (int32_t i=0; i<c.strings->size(); ++i) {
1313 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1314 if (!strings->contains((void*) s)) {
1315 _add(*s);
1316 }
1317 }
1318 }
1319 return *this;
1320 }
1321
1322 /**
1323 * Retains only the elements in this set that are contained in the
1324 * specified set. In other words, removes from this set all of
1325 * its elements that are not contained in the specified set. This
1326 * operation effectively modifies this set so that its value is
1327 * the <i>intersection</i> of the two sets.
1328 *
1329 * @param c set that defines which elements this set will retain.
1330 */
retainAll(const UnicodeSet & c)1331 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1332 if (isFrozen() || isBogus()) {
1333 return *this;
1334 }
1335 retain(c.list, c.len, 0);
1336 strings->retainAll(*c.strings);
1337 return *this;
1338 }
1339
1340 /**
1341 * Removes from this set all of its elements that are contained in the
1342 * specified set. This operation effectively modifies this
1343 * set so that its value is the <i>asymmetric set difference</i> of
1344 * the two sets.
1345 *
1346 * @param c set that defines which elements will be removed from
1347 * this set.
1348 */
removeAll(const UnicodeSet & c)1349 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1350 if (isFrozen() || isBogus()) {
1351 return *this;
1352 }
1353 retain(c.list, c.len, 2);
1354 strings->removeAll(*c.strings);
1355 return *this;
1356 }
1357
1358 /**
1359 * Complements in this set all elements contained in the specified
1360 * set. Any character in the other set will be removed if it is
1361 * in this set, or will be added if it is not in this set.
1362 *
1363 * @param c set that defines which elements will be xor'ed from
1364 * this set.
1365 */
complementAll(const UnicodeSet & c)1366 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1367 if (isFrozen() || isBogus()) {
1368 return *this;
1369 }
1370 exclusiveOr(c.list, c.len, 0);
1371
1372 for (int32_t i=0; i<c.strings->size(); ++i) {
1373 void* e = c.strings->elementAt(i);
1374 if (!strings->removeElement(e)) {
1375 _add(*(const UnicodeString*)e);
1376 }
1377 }
1378 return *this;
1379 }
1380
1381 /**
1382 * Removes all of the elements from this set. This set will be
1383 * empty after this call returns.
1384 */
clear(void)1385 UnicodeSet& UnicodeSet::clear(void) {
1386 if (isFrozen()) {
1387 return *this;
1388 }
1389 if (list != NULL) {
1390 list[0] = UNICODESET_HIGH;
1391 }
1392 len = 1;
1393 releasePattern();
1394 if (strings != NULL) {
1395 strings->removeAllElements();
1396 }
1397 if (list != NULL && strings != NULL) {
1398 // Remove bogus
1399 fFlags = 0;
1400 }
1401 return *this;
1402 }
1403
1404 /**
1405 * Iteration method that returns the number of ranges contained in
1406 * this set.
1407 * @see #getRangeStart
1408 * @see #getRangeEnd
1409 */
getRangeCount() const1410 int32_t UnicodeSet::getRangeCount() const {
1411 return len/2;
1412 }
1413
1414 /**
1415 * Iteration method that returns the first character in the
1416 * specified range of this set.
1417 * @see #getRangeCount
1418 * @see #getRangeEnd
1419 */
getRangeStart(int32_t index) const1420 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1421 return list[index*2];
1422 }
1423
1424 /**
1425 * Iteration method that returns the last character in the
1426 * specified range of this set.
1427 * @see #getRangeStart
1428 * @see #getRangeEnd
1429 */
getRangeEnd(int32_t index) const1430 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1431 return list[index*2 + 1] - 1;
1432 }
1433
getStringCount() const1434 int32_t UnicodeSet::getStringCount() const {
1435 return strings->size();
1436 }
1437
getString(int32_t index) const1438 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1439 return (const UnicodeString*) strings->elementAt(index);
1440 }
1441
1442 /**
1443 * Reallocate this objects internal structures to take up the least
1444 * possible space, without changing this object's value.
1445 */
compact()1446 UnicodeSet& UnicodeSet::compact() {
1447 if (isFrozen() || isBogus()) {
1448 return *this;
1449 }
1450 // Delete buffer first to defragment memory less.
1451 if (buffer != NULL) {
1452 uprv_free(buffer);
1453 buffer = NULL;
1454 }
1455 if (len < capacity) {
1456 // Make the capacity equal to len or 1.
1457 // We don't want to realloc of 0 size.
1458 int32_t newCapacity = len + (len == 0);
1459 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1460 if (temp) {
1461 list = temp;
1462 capacity = newCapacity;
1463 }
1464 // else what the heck happened?! We allocated less memory!
1465 // Oh well. We'll keep our original array.
1466 }
1467 return *this;
1468 }
1469
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1470 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1471 int32_t bmpLength, length, destLength;
1472
1473 if (U_FAILURE(ec)) {
1474 return 0;
1475 }
1476
1477 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1478 ec=U_ILLEGAL_ARGUMENT_ERROR;
1479 return 0;
1480 }
1481
1482 /* count necessary 16-bit units */
1483 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1484 // assert(length>=0);
1485 if (length==0) {
1486 /* empty set */
1487 if (destCapacity>0) {
1488 *dest=0;
1489 } else {
1490 ec=U_BUFFER_OVERFLOW_ERROR;
1491 }
1492 return 1;
1493 }
1494 /* now length>0 */
1495
1496 if (this->list[length-1]<=0xffff) {
1497 /* all BMP */
1498 bmpLength=length;
1499 } else if (this->list[0]>=0x10000) {
1500 /* all supplementary */
1501 bmpLength=0;
1502 length*=2;
1503 } else {
1504 /* some BMP, some supplementary */
1505 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1506 length=bmpLength+2*(length-bmpLength);
1507 }
1508
1509 /* length: number of 16-bit array units */
1510 if (length>0x7fff) {
1511 /* there are only 15 bits for the length in the first serialized word */
1512 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1513 return 0;
1514 }
1515
1516 /*
1517 * total serialized length:
1518 * number of 16-bit array units (length) +
1519 * 1 length unit (always) +
1520 * 1 bmpLength unit (if there are supplementary values)
1521 */
1522 destLength=length+((length>bmpLength)?2:1);
1523 if (destLength<=destCapacity) {
1524 const UChar32 *p;
1525 int32_t i;
1526
1527 *dest=(uint16_t)length;
1528 if (length>bmpLength) {
1529 *dest|=0x8000;
1530 *++dest=(uint16_t)bmpLength;
1531 }
1532 ++dest;
1533
1534 /* write the BMP part of the array */
1535 p=this->list;
1536 for (i=0; i<bmpLength; ++i) {
1537 *dest++=(uint16_t)*p++;
1538 }
1539
1540 /* write the supplementary part of the array */
1541 for (; i<length; i+=2) {
1542 *dest++=(uint16_t)(*p>>16);
1543 *dest++=(uint16_t)*p++;
1544 }
1545 } else {
1546 ec=U_BUFFER_OVERFLOW_ERROR;
1547 }
1548 return destLength;
1549 }
1550
1551 //----------------------------------------------------------------
1552 // Implementation: Utility methods
1553 //----------------------------------------------------------------
1554
1555 /**
1556 * Allocate our strings vector and return TRUE if successful.
1557 */
allocateStrings(UErrorCode & status)1558 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1559 if (U_FAILURE(status)) {
1560 return FALSE;
1561 }
1562 strings = new UVector(uhash_deleteUnicodeString,
1563 uhash_compareUnicodeString, 1, status);
1564 if (strings == NULL) { // Check for memory allocation error.
1565 status = U_MEMORY_ALLOCATION_ERROR;
1566 return FALSE;
1567 }
1568 if (U_FAILURE(status)) {
1569 delete strings;
1570 strings = NULL;
1571 return FALSE;
1572 }
1573 return TRUE;
1574 }
1575
ensureCapacity(int32_t newLen,UErrorCode & ec)1576 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1577 if (newLen <= capacity)
1578 return;
1579 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1580 if (temp == NULL) {
1581 ec = U_MEMORY_ALLOCATION_ERROR;
1582 setToBogus();
1583 return;
1584 }
1585 list = temp;
1586 capacity = newLen + GROW_EXTRA;
1587 // else we keep the original contents on the memory failure.
1588 }
1589
ensureBufferCapacity(int32_t newLen,UErrorCode & ec)1590 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1591 if (buffer != NULL && newLen <= bufferCapacity)
1592 return;
1593 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1594 if (temp == NULL) {
1595 ec = U_MEMORY_ALLOCATION_ERROR;
1596 setToBogus();
1597 return;
1598 }
1599 buffer = temp;
1600 bufferCapacity = newLen + GROW_EXTRA;
1601 // else we keep the original contents on the memory failure.
1602 }
1603
1604 /**
1605 * Swap list and buffer.
1606 */
swapBuffers(void)1607 void UnicodeSet::swapBuffers(void) {
1608 // swap list and buffer
1609 UChar32* temp = list;
1610 list = buffer;
1611 buffer = temp;
1612
1613 int32_t c = capacity;
1614 capacity = bufferCapacity;
1615 bufferCapacity = c;
1616 }
1617
setToBogus()1618 void UnicodeSet::setToBogus() {
1619 clear(); // Remove everything in the set.
1620 fFlags = kIsBogus;
1621 }
1622
1623 //----------------------------------------------------------------
1624 // Implementation: Fundamental operators
1625 //----------------------------------------------------------------
1626
max(UChar32 a,UChar32 b)1627 static inline UChar32 max(UChar32 a, UChar32 b) {
1628 return (a > b) ? a : b;
1629 }
1630
1631 // polarity = 0, 3 is normal: x xor y
1632 // polarity = 1, 2: x xor ~y == x === y
1633
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1634 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1635 if (isFrozen() || isBogus()) {
1636 return;
1637 }
1638 UErrorCode status = U_ZERO_ERROR;
1639 ensureBufferCapacity(len + otherLen, status);
1640 if (U_FAILURE(status)) {
1641 return;
1642 }
1643
1644 int32_t i = 0, j = 0, k = 0;
1645 UChar32 a = list[i++];
1646 UChar32 b;
1647 if (polarity == 1 || polarity == 2) {
1648 b = UNICODESET_LOW;
1649 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1650 ++j;
1651 b = other[j];
1652 }
1653 } else {
1654 b = other[j++];
1655 }
1656 // simplest of all the routines
1657 // sort the values, discarding identicals!
1658 for (;;) {
1659 if (a < b) {
1660 buffer[k++] = a;
1661 a = list[i++];
1662 } else if (b < a) {
1663 buffer[k++] = b;
1664 b = other[j++];
1665 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1666 // discard both values!
1667 a = list[i++];
1668 b = other[j++];
1669 } else { // DONE!
1670 buffer[k++] = UNICODESET_HIGH;
1671 len = k;
1672 break;
1673 }
1674 }
1675 swapBuffers();
1676 releasePattern();
1677 }
1678
1679 // polarity = 0 is normal: x union y
1680 // polarity = 2: x union ~y
1681 // polarity = 1: ~x union y
1682 // polarity = 3: ~x union ~y
1683
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1684 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1685 if (isFrozen() || isBogus() || other==NULL) {
1686 return;
1687 }
1688 UErrorCode status = U_ZERO_ERROR;
1689 ensureBufferCapacity(len + otherLen, status);
1690 if (U_FAILURE(status)) {
1691 return;
1692 }
1693
1694 int32_t i = 0, j = 0, k = 0;
1695 UChar32 a = list[i++];
1696 UChar32 b = other[j++];
1697 // change from xor is that we have to check overlapping pairs
1698 // polarity bit 1 means a is second, bit 2 means b is.
1699 for (;;) {
1700 switch (polarity) {
1701 case 0: // both first; take lower if unequal
1702 if (a < b) { // take a
1703 // Back up over overlapping ranges in buffer[]
1704 if (k > 0 && a <= buffer[k-1]) {
1705 // Pick latter end value in buffer[] vs. list[]
1706 a = max(list[i], buffer[--k]);
1707 } else {
1708 // No overlap
1709 buffer[k++] = a;
1710 a = list[i];
1711 }
1712 i++; // Common if/else code factored out
1713 polarity ^= 1;
1714 } else if (b < a) { // take b
1715 if (k > 0 && b <= buffer[k-1]) {
1716 b = max(other[j], buffer[--k]);
1717 } else {
1718 buffer[k++] = b;
1719 b = other[j];
1720 }
1721 j++;
1722 polarity ^= 2;
1723 } else { // a == b, take a, drop b
1724 if (a == UNICODESET_HIGH) goto loop_end;
1725 // This is symmetrical; it doesn't matter if
1726 // we backtrack with a or b. - liu
1727 if (k > 0 && a <= buffer[k-1]) {
1728 a = max(list[i], buffer[--k]);
1729 } else {
1730 // No overlap
1731 buffer[k++] = a;
1732 a = list[i];
1733 }
1734 i++;
1735 polarity ^= 1;
1736 b = other[j++];
1737 polarity ^= 2;
1738 }
1739 break;
1740 case 3: // both second; take higher if unequal, and drop other
1741 if (b <= a) { // take a
1742 if (a == UNICODESET_HIGH) goto loop_end;
1743 buffer[k++] = a;
1744 } else { // take b
1745 if (b == UNICODESET_HIGH) goto loop_end;
1746 buffer[k++] = b;
1747 }
1748 a = list[i++];
1749 polarity ^= 1; // factored common code
1750 b = other[j++];
1751 polarity ^= 2;
1752 break;
1753 case 1: // a second, b first; if b < a, overlap
1754 if (a < b) { // no overlap, take a
1755 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1756 } else if (b < a) { // OVERLAP, drop b
1757 b = other[j++];
1758 polarity ^= 2;
1759 } else { // a == b, drop both!
1760 if (a == UNICODESET_HIGH) goto loop_end;
1761 a = list[i++];
1762 polarity ^= 1;
1763 b = other[j++];
1764 polarity ^= 2;
1765 }
1766 break;
1767 case 2: // a first, b second; if a < b, overlap
1768 if (b < a) { // no overlap, take b
1769 buffer[k++] = b;
1770 b = other[j++];
1771 polarity ^= 2;
1772 } else if (a < b) { // OVERLAP, drop a
1773 a = list[i++];
1774 polarity ^= 1;
1775 } else { // a == b, drop both!
1776 if (a == UNICODESET_HIGH) goto loop_end;
1777 a = list[i++];
1778 polarity ^= 1;
1779 b = other[j++];
1780 polarity ^= 2;
1781 }
1782 break;
1783 }
1784 }
1785 loop_end:
1786 buffer[k++] = UNICODESET_HIGH; // terminate
1787 len = k;
1788 swapBuffers();
1789 releasePattern();
1790 }
1791
1792 // polarity = 0 is normal: x intersect y
1793 // polarity = 2: x intersect ~y == set-minus
1794 // polarity = 1: ~x intersect y
1795 // polarity = 3: ~x intersect ~y
1796
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1797 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1798 if (isFrozen() || isBogus()) {
1799 return;
1800 }
1801 UErrorCode status = U_ZERO_ERROR;
1802 ensureBufferCapacity(len + otherLen, status);
1803 if (U_FAILURE(status)) {
1804 return;
1805 }
1806
1807 int32_t i = 0, j = 0, k = 0;
1808 UChar32 a = list[i++];
1809 UChar32 b = other[j++];
1810 // change from xor is that we have to check overlapping pairs
1811 // polarity bit 1 means a is second, bit 2 means b is.
1812 for (;;) {
1813 switch (polarity) {
1814 case 0: // both first; drop the smaller
1815 if (a < b) { // drop a
1816 a = list[i++];
1817 polarity ^= 1;
1818 } else if (b < a) { // drop b
1819 b = other[j++];
1820 polarity ^= 2;
1821 } else { // a == b, take one, drop other
1822 if (a == UNICODESET_HIGH) goto loop_end;
1823 buffer[k++] = a;
1824 a = list[i++];
1825 polarity ^= 1;
1826 b = other[j++];
1827 polarity ^= 2;
1828 }
1829 break;
1830 case 3: // both second; take lower if unequal
1831 if (a < b) { // take a
1832 buffer[k++] = a;
1833 a = list[i++];
1834 polarity ^= 1;
1835 } else if (b < a) { // take b
1836 buffer[k++] = b;
1837 b = other[j++];
1838 polarity ^= 2;
1839 } else { // a == b, take one, drop other
1840 if (a == UNICODESET_HIGH) goto loop_end;
1841 buffer[k++] = a;
1842 a = list[i++];
1843 polarity ^= 1;
1844 b = other[j++];
1845 polarity ^= 2;
1846 }
1847 break;
1848 case 1: // a second, b first;
1849 if (a < b) { // NO OVERLAP, drop a
1850 a = list[i++];
1851 polarity ^= 1;
1852 } else if (b < a) { // OVERLAP, take b
1853 buffer[k++] = b;
1854 b = other[j++];
1855 polarity ^= 2;
1856 } else { // a == b, drop both!
1857 if (a == UNICODESET_HIGH) goto loop_end;
1858 a = list[i++];
1859 polarity ^= 1;
1860 b = other[j++];
1861 polarity ^= 2;
1862 }
1863 break;
1864 case 2: // a first, b second; if a < b, overlap
1865 if (b < a) { // no overlap, drop b
1866 b = other[j++];
1867 polarity ^= 2;
1868 } else if (a < b) { // OVERLAP, take a
1869 buffer[k++] = a;
1870 a = list[i++];
1871 polarity ^= 1;
1872 } else { // a == b, drop both!
1873 if (a == UNICODESET_HIGH) goto loop_end;
1874 a = list[i++];
1875 polarity ^= 1;
1876 b = other[j++];
1877 polarity ^= 2;
1878 }
1879 break;
1880 }
1881 }
1882 loop_end:
1883 buffer[k++] = UNICODESET_HIGH; // terminate
1884 len = k;
1885 swapBuffers();
1886 releasePattern();
1887 }
1888
1889 /**
1890 * Append the <code>toPattern()</code> representation of a
1891 * string to the given <code>StringBuffer</code>.
1892 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1893 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1894 escapeUnprintable) {
1895 UChar32 cp;
1896 for (int32_t i = 0; i < s.length(); i += UTF_CHAR_LENGTH(cp)) {
1897 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1898 }
1899 }
1900
1901 /**
1902 * Append the <code>toPattern()</code> representation of a
1903 * character to the given <code>StringBuffer</code>.
1904 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1905 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1906 escapeUnprintable) {
1907 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1908 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1909 // unprintable
1910 if (ICU_Utility::escapeUnprintable(buf, c)) {
1911 return;
1912 }
1913 }
1914 // Okay to let ':' pass through
1915 switch (c) {
1916 case SET_OPEN:
1917 case SET_CLOSE:
1918 case HYPHEN:
1919 case COMPLEMENT:
1920 case INTERSECTION:
1921 case BACKSLASH:
1922 case OPEN_BRACE:
1923 case CLOSE_BRACE:
1924 case COLON:
1925 case SymbolTable::SYMBOL_REF:
1926 buf.append(BACKSLASH);
1927 break;
1928 default:
1929 // Escape whitespace
1930 if (PatternProps::isWhiteSpace(c)) {
1931 buf.append(BACKSLASH);
1932 }
1933 break;
1934 }
1935 buf.append(c);
1936 }
1937
1938 /**
1939 * Append a string representation of this set to result. This will be
1940 * a cleaned version of the string passed to applyPattern(), if there
1941 * is one. Otherwise it will be generated.
1942 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const1943 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1944 UBool escapeUnprintable) const
1945 {
1946 if (pat != NULL) {
1947 int32_t i;
1948 int32_t backslashCount = 0;
1949 for (i=0; i<patLen; ) {
1950 UChar32 c;
1951 U16_NEXT(pat, i, patLen, c);
1952 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1953 // If the unprintable character is preceded by an odd
1954 // number of backslashes, then it has been escaped.
1955 // Before unescaping it, we delete the final
1956 // backslash.
1957 if ((backslashCount % 2) == 1) {
1958 result.truncate(result.length() - 1);
1959 }
1960 ICU_Utility::escapeUnprintable(result, c);
1961 backslashCount = 0;
1962 } else {
1963 result.append(c);
1964 if (c == BACKSLASH) {
1965 ++backslashCount;
1966 } else {
1967 backslashCount = 0;
1968 }
1969 }
1970 }
1971 return result;
1972 }
1973
1974 return _generatePattern(result, escapeUnprintable);
1975 }
1976
1977 /**
1978 * Returns a string representation of this set. If the result of
1979 * calling this function is passed to a UnicodeSet constructor, it
1980 * will produce another set that is equal to this one.
1981 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const1982 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1983 UBool escapeUnprintable) const
1984 {
1985 result.truncate(0);
1986 return _toPattern(result, escapeUnprintable);
1987 }
1988
1989 /**
1990 * Generate and append a string representation of this set to result.
1991 * This does not use this.pat, the cleaned up copy of the string
1992 * passed to applyPattern().
1993 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const1994 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1995 UBool escapeUnprintable) const
1996 {
1997 result.append(SET_OPEN);
1998
1999 // // Check against the predefined categories. We implicitly build
2000 // // up ALL category sets the first time toPattern() is called.
2001 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2002 // if (*this == getCategorySet(cat)) {
2003 // result.append(COLON);
2004 // result.append(CATEGORY_NAMES, cat*2, 2);
2005 // return result.append(CATEGORY_CLOSE);
2006 // }
2007 // }
2008
2009 int32_t count = getRangeCount();
2010
2011 // If the set contains at least 2 intervals and includes both
2012 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2013 // be more economical.
2014 if (count > 1 &&
2015 getRangeStart(0) == MIN_VALUE &&
2016 getRangeEnd(count-1) == MAX_VALUE) {
2017
2018 // Emit the inverse
2019 result.append(COMPLEMENT);
2020
2021 for (int32_t i = 1; i < count; ++i) {
2022 UChar32 start = getRangeEnd(i-1)+1;
2023 UChar32 end = getRangeStart(i)-1;
2024 _appendToPat(result, start, escapeUnprintable);
2025 if (start != end) {
2026 if ((start+1) != end) {
2027 result.append(HYPHEN);
2028 }
2029 _appendToPat(result, end, escapeUnprintable);
2030 }
2031 }
2032 }
2033
2034 // Default; emit the ranges as pairs
2035 else {
2036 for (int32_t i = 0; i < count; ++i) {
2037 UChar32 start = getRangeStart(i);
2038 UChar32 end = getRangeEnd(i);
2039 _appendToPat(result, start, escapeUnprintable);
2040 if (start != end) {
2041 if ((start+1) != end) {
2042 result.append(HYPHEN);
2043 }
2044 _appendToPat(result, end, escapeUnprintable);
2045 }
2046 }
2047 }
2048
2049 for (int32_t i = 0; i<strings->size(); ++i) {
2050 result.append(OPEN_BRACE);
2051 _appendToPat(result,
2052 *(const UnicodeString*) strings->elementAt(i),
2053 escapeUnprintable);
2054 result.append(CLOSE_BRACE);
2055 }
2056 return result.append(SET_CLOSE);
2057 }
2058
2059 /**
2060 * Release existing cached pattern
2061 */
releasePattern()2062 void UnicodeSet::releasePattern() {
2063 if (pat) {
2064 uprv_free(pat);
2065 pat = NULL;
2066 patLen = 0;
2067 }
2068 }
2069
2070 /**
2071 * Set the new pattern to cache.
2072 */
setPattern(const UnicodeString & newPat)2073 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2074 releasePattern();
2075 int32_t newPatLen = newPat.length();
2076 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2077 if (pat) {
2078 patLen = newPatLen;
2079 newPat.extractBetween(0, patLen, pat);
2080 pat[patLen] = 0;
2081 }
2082 // else we don't care if malloc failed. This was just a nice cache.
2083 // We can regenerate an equivalent pattern later when requested.
2084 }
2085
freeze()2086 UnicodeFunctor *UnicodeSet::freeze() {
2087 if(!isFrozen() && !isBogus()) {
2088 // Do most of what compact() does before freezing because
2089 // compact() will not work when the set is frozen.
2090 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2091
2092 // Delete buffer first to defragment memory less.
2093 if (buffer != NULL) {
2094 uprv_free(buffer);
2095 buffer = NULL;
2096 }
2097 if (capacity > (len + GROW_EXTRA)) {
2098 // Make the capacity equal to len or 1.
2099 // We don't want to realloc of 0 size.
2100 capacity = len + (len == 0);
2101 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2102 if (list == NULL) { // Check for memory allocation error.
2103 setToBogus();
2104 return this;
2105 }
2106 }
2107
2108 // Optimize contains() and span() and similar functions.
2109 if (!strings->isEmpty()) {
2110 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2111 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2112 // All strings are irrelevant for span() etc. because
2113 // all of each string's code points are contained in this set.
2114 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2115 // many relevant strings as UTF-16.
2116 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2117 delete stringSpan;
2118 stringSpan = NULL;
2119 }
2120 }
2121 if (stringSpan == NULL) {
2122 // No span-relevant strings: Optimize for code point spans.
2123 bmpSet=new BMPSet(list, len);
2124 if (bmpSet == NULL) { // Check for memory allocation error.
2125 setToBogus();
2126 }
2127 }
2128 }
2129 return this;
2130 }
2131
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2132 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2133 if(length>0 && bmpSet!=NULL) {
2134 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2135 }
2136 if(length<0) {
2137 length=u_strlen(s);
2138 }
2139 if(length==0) {
2140 return 0;
2141 }
2142 if(stringSpan!=NULL) {
2143 return stringSpan->span(s, length, spanCondition);
2144 } else if(!strings->isEmpty()) {
2145 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2146 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2147 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2148 UnicodeSetStringSpan strSpan(*this, *strings, which);
2149 if(strSpan.needsStringSpanUTF16()) {
2150 return strSpan.span(s, length, spanCondition);
2151 }
2152 }
2153
2154 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2155 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2156 }
2157
2158 UChar32 c;
2159 int32_t start=0, prev=0;
2160 do {
2161 U16_NEXT(s, start, length, c);
2162 if(spanCondition!=contains(c)) {
2163 break;
2164 }
2165 } while((prev=start)<length);
2166 return prev;
2167 }
2168
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2169 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2170 if(length>0 && bmpSet!=NULL) {
2171 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2172 }
2173 if(length<0) {
2174 length=u_strlen(s);
2175 }
2176 if(length==0) {
2177 return 0;
2178 }
2179 if(stringSpan!=NULL) {
2180 return stringSpan->spanBack(s, length, spanCondition);
2181 } else if(!strings->isEmpty()) {
2182 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2183 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2184 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2185 UnicodeSetStringSpan strSpan(*this, *strings, which);
2186 if(strSpan.needsStringSpanUTF16()) {
2187 return strSpan.spanBack(s, length, spanCondition);
2188 }
2189 }
2190
2191 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2192 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2193 }
2194
2195 UChar32 c;
2196 int32_t prev=length;
2197 do {
2198 U16_PREV(s, 0, length, c);
2199 if(spanCondition!=contains(c)) {
2200 break;
2201 }
2202 } while((prev=length)>0);
2203 return prev;
2204 }
2205
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2206 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2207 if(length>0 && bmpSet!=NULL) {
2208 const uint8_t *s0=(const uint8_t *)s;
2209 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2210 }
2211 if(length<0) {
2212 length=(int32_t)uprv_strlen(s);
2213 }
2214 if(length==0) {
2215 return 0;
2216 }
2217 if(stringSpan!=NULL) {
2218 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2219 } else if(!strings->isEmpty()) {
2220 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2221 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2222 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2223 UnicodeSetStringSpan strSpan(*this, *strings, which);
2224 if(strSpan.needsStringSpanUTF8()) {
2225 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2226 }
2227 }
2228
2229 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2230 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2231 }
2232
2233 UChar32 c;
2234 int32_t start=0, prev=0;
2235 do {
2236 U8_NEXT(s, start, length, c);
2237 if(c<0) {
2238 c=0xfffd;
2239 }
2240 if(spanCondition!=contains(c)) {
2241 break;
2242 }
2243 } while((prev=start)<length);
2244 return prev;
2245 }
2246
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2247 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2248 if(length>0 && bmpSet!=NULL) {
2249 const uint8_t *s0=(const uint8_t *)s;
2250 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2251 }
2252 if(length<0) {
2253 length=(int32_t)uprv_strlen(s);
2254 }
2255 if(length==0) {
2256 return 0;
2257 }
2258 if(stringSpan!=NULL) {
2259 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2260 } else if(!strings->isEmpty()) {
2261 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2262 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2263 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2264 UnicodeSetStringSpan strSpan(*this, *strings, which);
2265 if(strSpan.needsStringSpanUTF8()) {
2266 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2267 }
2268 }
2269
2270 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2271 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2272 }
2273
2274 UChar32 c;
2275 int32_t prev=length;
2276 do {
2277 U8_PREV(s, 0, length, c);
2278 if(c<0) {
2279 c=0xfffd;
2280 }
2281 if(spanCondition!=contains(c)) {
2282 break;
2283 }
2284 } while((prev=length)>0);
2285 return prev;
2286 }
2287
2288 U_NAMESPACE_END
2289