1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12 #include "unicode/parsepos.h"
13 #include "unicode/symtable.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utf8.h"
16 #include "unicode/utf16.h"
17 #include "ruleiter.h"
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "patternprops.h"
21 #include "uelement.h"
22 #include "util.h"
23 #include "uvector.h"
24 #include "charstr.h"
25 #include "ustrfmt.h"
26 #include "uassert.h"
27 #include "bmpset.h"
28 #include "unisetspan.h"
29
30 // Define UChar constants using hex for EBCDIC compatibility
31 // Used #define to reduce private static exports and memory access time.
32 #define SET_OPEN ((UChar)0x005B) /*[*/
33 #define SET_CLOSE ((UChar)0x005D) /*]*/
34 #define HYPHEN ((UChar)0x002D) /*-*/
35 #define COMPLEMENT ((UChar)0x005E) /*^*/
36 #define COLON ((UChar)0x003A) /*:*/
37 #define BACKSLASH ((UChar)0x005C) /*\*/
38 #define INTERSECTION ((UChar)0x0026) /*&*/
39 #define UPPER_U ((UChar)0x0055) /*U*/
40 #define LOWER_U ((UChar)0x0075) /*u*/
41 #define OPEN_BRACE ((UChar)123) /*{*/
42 #define CLOSE_BRACE ((UChar)125) /*}*/
43 #define UPPER_P ((UChar)0x0050) /*P*/
44 #define LOWER_P ((UChar)0x0070) /*p*/
45 #define UPPER_N ((UChar)78) /*N*/
46 #define EQUALS ((UChar)0x003D) /*=*/
47
48 // HIGH_VALUE > all valid values. 110000 for codepoints
49 #define UNICODESET_HIGH 0x0110000
50
51 // LOW <= all valid values. ZERO for codepoints
52 #define UNICODESET_LOW 0x000000
53
54 // initial storage. Must be >= 0
55 #define START_EXTRA 16
56
57 // extra amount for growth. Must be >= 0
58 #define GROW_EXTRA START_EXTRA
59
60 U_NAMESPACE_BEGIN
61
~SymbolTable()62 SymbolTable::~SymbolTable() {}
63
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66 /**
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
71 */
72 static inline UChar32 pinCodePoint(UChar32& c) {
73 if (c < UNICODESET_LOW) {
74 c = UNICODESET_LOW;
75 } else if (c > (UNICODESET_HIGH-1)) {
76 c = (UNICODESET_HIGH-1);
77 }
78 return c;
79 }
80
81 //----------------------------------------------------------------
82 // Debugging
83 //----------------------------------------------------------------
84
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
88 // like this:
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
94
95 // #define DEBUG_MEM
96
97 #ifdef DEBUG_MEM
98 #include <stdio.h>
99 static int32_t _dbgCount = 0;
100
_dbgct(UnicodeSet * set)101 static inline void _dbgct(UnicodeSet* set) {
102 UnicodeString str;
103 set->toPattern(str, TRUE);
104 char buf[40];
105 str.extract(0, 39, buf, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107 }
108
_dbgdt(UnicodeSet * set)109 static inline void _dbgdt(UnicodeSet* set) {
110 UnicodeString str;
111 set->toPattern(str, TRUE);
112 char buf[40];
113 str.extract(0, 39, buf, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115 }
116
117 #else
118
119 #define _dbgct(set)
120 #define _dbgdt(set)
121
122 #endif
123
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
127
cloneUnicodeString(UElement * dst,UElement * src)128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130 }
131
compareUnicodeString(UElement t1,UElement t2)132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135 return a.compare(b);
136 }
137
138 //----------------------------------------------------------------
139 // Constructors &c
140 //----------------------------------------------------------------
141
142 /**
143 * Constructs an empty set.
144 */
UnicodeSet()145 UnicodeSet::UnicodeSet() :
146 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
147 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
148 fFlags(0)
149 {
150 UErrorCode status = U_ZERO_ERROR;
151 allocateStrings(status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
156 if(list!=NULL){
157 list[0] = UNICODESET_HIGH;
158 } else { // If memory allocation failed, set to bogus state.
159 setToBogus();
160 return;
161 }
162 _dbgct(this);
163 }
164
165 /**
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
168 *
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
171 */
UnicodeSet(UChar32 start,UChar32 end)172 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
173 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
174 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
175 fFlags(0)
176 {
177 UErrorCode status = U_ZERO_ERROR;
178 allocateStrings(status);
179 if (U_FAILURE(status)) {
180 return;
181 }
182 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
183 if(list!=NULL){
184 list[0] = UNICODESET_HIGH;
185 complement(start, end);
186 } else { // If memory allocation failed, set to bogus state.
187 setToBogus();
188 return;
189 }
190 _dbgct(this);
191 }
192
193 /**
194 * Constructs a set that is identical to the given UnicodeSet.
195 */
UnicodeSet(const UnicodeSet & o)196 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
197 UnicodeFilter(o),
198 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
199 bmpSet(0),
200 buffer(0), bufferCapacity(0),
201 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
202 fFlags(0)
203 {
204 UErrorCode status = U_ZERO_ERROR;
205 allocateStrings(status);
206 if (U_FAILURE(status)) {
207 return;
208 }
209 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
210 if(list!=NULL){
211 *this = o;
212 } else { // If memory allocation failed, set to bogus state.
213 setToBogus();
214 return;
215 }
216 _dbgct(this);
217 }
218
219 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)220 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
221 UnicodeFilter(o),
222 len(0), capacity(o.len + GROW_EXTRA), list(0),
223 bmpSet(0),
224 buffer(0), bufferCapacity(0),
225 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
226 fFlags(0)
227 {
228 UErrorCode status = U_ZERO_ERROR;
229 allocateStrings(status);
230 if (U_FAILURE(status)) {
231 return;
232 }
233 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
234 if(list!=NULL){
235 // *this = o except for bmpSet and stringSpan
236 len = o.len;
237 uprv_memcpy(list, o.list, len*sizeof(UChar32));
238 if (strings != NULL && o.strings != NULL) {
239 strings->assign(*o.strings, cloneUnicodeString, status);
240 } else { // Invalid strings.
241 setToBogus();
242 return;
243 }
244 if (o.pat) {
245 setPattern(UnicodeString(o.pat, o.patLen));
246 }
247 } else { // If memory allocation failed, set to bogus state.
248 setToBogus();
249 return;
250 }
251 _dbgct(this);
252 }
253
254 /**
255 * Destructs the set.
256 */
~UnicodeSet()257 UnicodeSet::~UnicodeSet() {
258 _dbgdt(this); // first!
259 uprv_free(list);
260 delete bmpSet;
261 if (buffer) {
262 uprv_free(buffer);
263 }
264 delete strings;
265 delete stringSpan;
266 releasePattern();
267 }
268
269 /**
270 * Assigns this object to be a copy of another.
271 */
operator =(const UnicodeSet & o)272 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
273 if (this == &o) {
274 return *this;
275 }
276 if (isFrozen()) {
277 return *this;
278 }
279 if (o.isBogus()) {
280 setToBogus();
281 return *this;
282 }
283 UErrorCode ec = U_ZERO_ERROR;
284 ensureCapacity(o.len, ec);
285 if (U_FAILURE(ec)) {
286 return *this; // There is no way to report this error :-(
287 }
288 len = o.len;
289 uprv_memcpy(list, o.list, len*sizeof(UChar32));
290 if (o.bmpSet == NULL) {
291 bmpSet = NULL;
292 } else {
293 bmpSet = new BMPSet(*o.bmpSet, list, len);
294 if (bmpSet == NULL) { // Check for memory allocation error.
295 setToBogus();
296 return *this;
297 }
298 }
299 if (strings != NULL && o.strings != NULL) {
300 strings->assign(*o.strings, cloneUnicodeString, ec);
301 } else { // Invalid strings.
302 setToBogus();
303 return *this;
304 }
305 if (o.stringSpan == NULL) {
306 stringSpan = NULL;
307 } else {
308 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
309 if (stringSpan == NULL) { // Check for memory allocation error.
310 setToBogus();
311 return *this;
312 }
313 }
314 releasePattern();
315 if (o.pat) {
316 setPattern(UnicodeString(o.pat, o.patLen));
317 }
318 return *this;
319 }
320
321 /**
322 * Returns a copy of this object. All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
325 */
clone() const326 UnicodeFunctor* UnicodeSet::clone() const {
327 return new UnicodeSet(*this);
328 }
329
cloneAsThawed() const330 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
331 return new UnicodeSet(*this, TRUE);
332 }
333
334 /**
335 * Compares the specified object with this set for equality. Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
340 *
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
343 */
operator ==(const UnicodeSet & o) const344 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
345 if (len != o.len) return FALSE;
346 for (int32_t i = 0; i < len; ++i) {
347 if (list[i] != o.list[i]) return FALSE;
348 }
349 if (*strings != *o.strings) return FALSE;
350 return TRUE;
351 }
352
353 /**
354 * Returns the hash code value for this set.
355 *
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
358 */
hashCode(void) const359 int32_t UnicodeSet::hashCode(void) const {
360 int32_t result = len;
361 for (int32_t i = 0; i < len; ++i) {
362 result *= 1000003;
363 result += list[i];
364 }
365 return result;
366 }
367
368 //----------------------------------------------------------------
369 // Public API
370 //----------------------------------------------------------------
371
372 /**
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
376 *
377 * @return the number of elements in this set (its cardinality).
378 */
size(void) const379 int32_t UnicodeSet::size(void) const {
380 int32_t n = 0;
381 int32_t count = getRangeCount();
382 for (int32_t i = 0; i < count; ++i) {
383 n += getRangeEnd(i) - getRangeStart(i) + 1;
384 }
385 return n + strings->size();
386 }
387
388 /**
389 * Returns <tt>true</tt> if this set contains no elements.
390 *
391 * @return <tt>true</tt> if this set contains no elements.
392 */
isEmpty(void) const393 UBool UnicodeSet::isEmpty(void) const {
394 return len == 1 && strings->size() == 0;
395 }
396
397 /**
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
401 */
contains(UChar32 c) const402 UBool UnicodeSet::contains(UChar32 c) const {
403 // Set i to the index of the start item greater than ch
404 // We know we will terminate without length test!
405 // LATER: for large sets, add binary search
406 //int32_t i = -1;
407 //for (;;) {
408 // if (c < list[++i]) break;
409 //}
410 if (bmpSet != NULL) {
411 return bmpSet->contains(c);
412 }
413 if (stringSpan != NULL) {
414 return stringSpan->contains(c);
415 }
416 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
417 return FALSE;
418 }
419 int32_t i = findCodePoint(c);
420 return (UBool)(i & 1); // return true if odd
421 }
422
423 /**
424 * Returns the smallest value i such that c < list[i]. Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop. This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
428 * inclusive
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
431 */
findCodePoint(UChar32 c) const432 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
433 /* Examples:
434 findCodePoint(c)
435 set list[] c=0 1 3 4 7 8
436 === ============== ===========
437 [] [110000] 0 0 0 0 0 0
438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
440 [:Any:] [0, 110000] 1 1 1 1 1 1
441 */
442
443 // Return the smallest i such that c < list[i]. Assume
444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
445 if (c < list[0])
446 return 0;
447 // High runner test. c is often after the last range, so an
448 // initial check for this condition pays off.
449 int32_t lo = 0;
450 int32_t hi = len - 1;
451 if (lo >= hi || c >= list[hi-1])
452 return hi;
453 // invariant: c >= list[lo]
454 // invariant: c < list[hi]
455 for (;;) {
456 int32_t i = (lo + hi) >> 1;
457 if (i == lo) {
458 break; // Found!
459 } else if (c < list[i]) {
460 hi = i;
461 } else {
462 lo = i;
463 }
464 }
465 return hi;
466 }
467
468 /**
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
474 */
contains(UChar32 start,UChar32 end) const475 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
476 //int32_t i = -1;
477 //for (;;) {
478 // if (start < list[++i]) break;
479 //}
480 int32_t i = findCodePoint(start);
481 return ((i & 1) != 0 && end < list[i]);
482 }
483
484 /**
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
489 */
contains(const UnicodeString & s) const490 UBool UnicodeSet::contains(const UnicodeString& s) const {
491 if (s.length() == 0) return FALSE;
492 int32_t cp = getSingleCP(s);
493 if (cp < 0) {
494 return strings->contains((void*) &s);
495 } else {
496 return contains((UChar32) cp);
497 }
498 }
499
500 /**
501 * Returns true if this set contains all the characters and strings
502 * of the given set.
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
505 */
containsAll(const UnicodeSet & c) const506 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
507 // The specified set is a subset if all of its pairs are contained in
508 // this set. It's possible to code this more efficiently in terms of
509 // direct manipulation of the inversion lists if the need arises.
510 int32_t n = c.getRangeCount();
511 for (int i=0; i<n; ++i) {
512 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
513 return FALSE;
514 }
515 }
516 if (!strings->containsAll(*c.strings)) return FALSE;
517 return TRUE;
518 }
519
520 /**
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
525 */
containsAll(const UnicodeString & s) const526 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
527 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
528 s.length());
529 }
530
531 /**
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
537 */
containsNone(UChar32 start,UChar32 end) const538 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
539 //int32_t i = -1;
540 //for (;;) {
541 // if (start < list[++i]) break;
542 //}
543 int32_t i = findCodePoint(start);
544 return ((i & 1) == 0 && end < list[i]);
545 }
546
547 /**
548 * Returns true if this set contains none of the characters and strings
549 * of the given set.
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
552 */
containsNone(const UnicodeSet & c) const553 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
554 // The specified set is a subset if all of its pairs are contained in
555 // this set. It's possible to code this more efficiently in terms of
556 // direct manipulation of the inversion lists if the need arises.
557 int32_t n = c.getRangeCount();
558 for (int32_t i=0; i<n; ++i) {
559 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
560 return FALSE;
561 }
562 }
563 if (!strings->containsNone(*c.strings)) return FALSE;
564 return TRUE;
565 }
566
567 /**
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
572 */
containsNone(const UnicodeString & s) const573 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
574 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
575 s.length());
576 }
577
578 /**
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
581 * indexing.
582 */
matchesIndexValue(uint8_t v) const583 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
584 /* The index value v, in the range [0,255], is contained in this set if
585 * it is contained in any pair of this set. Pairs either have the high
586 * bytes equal, or unequal. If the high bytes are equal, then we have
587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589 * Then v is contained if xx <= v || v <= yy. (This is identical to the
590 * time zone month containment logic.)
591 */
592 int32_t i;
593 int32_t rangeCount=getRangeCount();
594 for (i=0; i<rangeCount; ++i) {
595 UChar32 low = getRangeStart(i);
596 UChar32 high = getRangeEnd(i);
597 if ((low & ~0xFF) == (high & ~0xFF)) {
598 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
599 return TRUE;
600 }
601 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
602 return TRUE;
603 }
604 }
605 if (strings->size() != 0) {
606 for (i=0; i<strings->size(); ++i) {
607 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
608 //if (s.length() == 0) {
609 // // Empty strings match everything
610 // return TRUE;
611 //}
612 // assert(s.length() != 0); // We enforce this elsewhere
613 UChar32 c = s.char32At(0);
614 if ((c & 0xFF) == v) {
615 return TRUE;
616 }
617 }
618 }
619 return FALSE;
620 }
621
622 /**
623 * Implementation of UnicodeMatcher::matches(). Always matches the
624 * longest possible multichar string.
625 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)626 UMatchDegree UnicodeSet::matches(const Replaceable& text,
627 int32_t& offset,
628 int32_t limit,
629 UBool incremental) {
630 if (offset == limit) {
631 // Strings, if any, have length != 0, so we don't worry
632 // about them here. If we ever allow zero-length strings
633 // we much check for them here.
634 if (contains(U_ETHER)) {
635 return incremental ? U_PARTIAL_MATCH : U_MATCH;
636 } else {
637 return U_MISMATCH;
638 }
639 } else {
640 if (strings->size() != 0) { // try strings first
641
642 // might separate forward and backward loops later
643 // for now they are combined
644
645 // TODO Improve efficiency of this, at least in the forward
646 // direction, if not in both. In the forward direction we
647 // can assume the strings are sorted.
648
649 int32_t i;
650 UBool forward = offset < limit;
651
652 // firstChar is the leftmost char to match in the
653 // forward direction or the rightmost char to match in
654 // the reverse direction.
655 UChar firstChar = text.charAt(offset);
656
657 // If there are multiple strings that can match we
658 // return the longest match.
659 int32_t highWaterLength = 0;
660
661 for (i=0; i<strings->size(); ++i) {
662 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
663
664 //if (trial.length() == 0) {
665 // return U_MATCH; // null-string always matches
666 //}
667 // assert(trial.length() != 0); // We ensure this elsewhere
668
669 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
670
671 // Strings are sorted, so we can optimize in the
672 // forward direction.
673 if (forward && c > firstChar) break;
674 if (c != firstChar) continue;
675
676 int32_t matchLen = matchRest(text, offset, limit, trial);
677
678 if (incremental) {
679 int32_t maxLen = forward ? limit-offset : offset-limit;
680 if (matchLen == maxLen) {
681 // We have successfully matched but only up to limit.
682 return U_PARTIAL_MATCH;
683 }
684 }
685
686 if (matchLen == trial.length()) {
687 // We have successfully matched the whole string.
688 if (matchLen > highWaterLength) {
689 highWaterLength = matchLen;
690 }
691 // In the forward direction we know strings
692 // are sorted so we can bail early.
693 if (forward && matchLen < highWaterLength) {
694 break;
695 }
696 continue;
697 }
698 }
699
700 // We've checked all strings without a partial match.
701 // If we have full matches, return the longest one.
702 if (highWaterLength != 0) {
703 offset += forward ? highWaterLength : -highWaterLength;
704 return U_MATCH;
705 }
706 }
707 return UnicodeFilter::matches(text, offset, limit, incremental);
708 }
709 }
710
711 /**
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0). If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1). This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
718 * check it.
719 * @param text the text to match
720 * @param start the first character to match. In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|. If all of s matches before reaching the limit, return
729 * s.length(). If there is a mismatch between s and text, return
730 * 0
731 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)732 int32_t UnicodeSet::matchRest(const Replaceable& text,
733 int32_t start, int32_t limit,
734 const UnicodeString& s) {
735 int32_t i;
736 int32_t maxLen;
737 int32_t slen = s.length();
738 if (start < limit) {
739 maxLen = limit - start;
740 if (maxLen > slen) maxLen = slen;
741 for (i = 1; i < maxLen; ++i) {
742 if (text.charAt(start + i) != s.charAt(i)) return 0;
743 }
744 } else {
745 maxLen = start - limit;
746 if (maxLen > slen) maxLen = slen;
747 --slen; // <=> slen = s.length() - 1;
748 for (i = 1; i < maxLen; ++i) {
749 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
750 }
751 }
752 return maxLen;
753 }
754
755 /**
756 * Implement of UnicodeMatcher
757 */
addMatchSetTo(UnicodeSet & toUnionTo) const758 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
759 toUnionTo.addAll(*this);
760 }
761
762 /**
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point. If the character
765 * is not in this set, return -1. The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
768 */
indexOf(UChar32 c) const769 int32_t UnicodeSet::indexOf(UChar32 c) const {
770 if (c < MIN_VALUE || c > MAX_VALUE) {
771 return -1;
772 }
773 int32_t i = 0;
774 int32_t n = 0;
775 for (;;) {
776 UChar32 start = list[i++];
777 if (c < start) {
778 return -1;
779 }
780 UChar32 limit = list[i++];
781 if (c < limit) {
782 return n + c - start;
783 }
784 n += limit - start;
785 }
786 }
787
788 /**
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point. If the index is
791 * out of range, return (UChar32)-1. The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
795 */
charAt(int32_t index) const796 UChar32 UnicodeSet::charAt(int32_t index) const {
797 if (index >= 0) {
798 // len2 is the largest even integer <= len, that is, it is len
799 // for even values and len-1 for odd values. With odd values
800 // the last entry is UNICODESET_HIGH.
801 int32_t len2 = len & ~1;
802 for (int32_t i=0; i < len2;) {
803 UChar32 start = list[i++];
804 int32_t count = list[i++] - start;
805 if (index < count) {
806 return (UChar32)(start + index);
807 }
808 index -= count;
809 }
810 }
811 return (UChar32)-1;
812 }
813
814 /**
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
817 * an empty range.
818 *
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
821 */
set(UChar32 start,UChar32 end)822 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
823 clear();
824 complement(start, end);
825 return *this;
826 }
827
828 /**
829 * Adds the specified range to this set if it is not already
830 * present. If this set already contains the specified range,
831 * the call leaves this set unchanged. If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
833 *
834 * @param start first character, inclusive, of range to be added
835 * to this set.
836 * @param end last character, inclusive, of range to be added
837 * to this set.
838 */
add(UChar32 start,UChar32 end)839 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
840 if (pinCodePoint(start) < pinCodePoint(end)) {
841 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
842 add(range, 2, 0);
843 } else if (start == end) {
844 add(start);
845 }
846 return *this;
847 }
848
849 // #define DEBUG_US_ADD
850
851 #ifdef DEBUG_US_ADD
852 #include <stdio.h>
dump(UChar32 c)853 void dump(UChar32 c) {
854 if (c <= 0xFF) {
855 printf("%c", (char)c);
856 } else {
857 printf("U+%04X", c);
858 }
859 }
dump(const UChar32 * list,int32_t len)860 void dump(const UChar32* list, int32_t len) {
861 printf("[");
862 for (int32_t i=0; i<len; ++i) {
863 if (i != 0) printf(", ");
864 dump(list[i]);
865 }
866 printf("]");
867 }
868 #endif
869
870 /**
871 * Adds the specified character to this set if it is not already
872 * present. If this set already contains the specified character,
873 * the call leaves this set unchanged.
874 */
add(UChar32 c)875 UnicodeSet& UnicodeSet::add(UChar32 c) {
876 // find smallest i such that c < list[i]
877 // if odd, then it is IN the set
878 // if even, then it is OUT of the set
879 int32_t i = findCodePoint(pinCodePoint(c));
880
881 // already in set?
882 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
883
884 // HIGH is 0x110000
885 // assert(list[len-1] == HIGH);
886
887 // empty = [HIGH]
888 // [start_0, limit_0, start_1, limit_1, HIGH]
889
890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
891 // ^
892 // list[i]
893
894 // i == 0 means c is before the first range
895
896 #ifdef DEBUG_US_ADD
897 printf("Add of ");
898 dump(c);
899 printf(" found at %d", i);
900 printf(": ");
901 dump(list, len);
902 printf(" => ");
903 #endif
904
905 if (c == list[i]-1) {
906 // c is before start of next range
907 list[i] = c;
908 // if we touched the HIGH mark, then add a new one
909 if (c == (UNICODESET_HIGH - 1)) {
910 UErrorCode status = U_ZERO_ERROR;
911 ensureCapacity(len+1, status);
912 if (U_FAILURE(status)) {
913 return *this; // There is no way to report this error :-(
914 }
915 list[len++] = UNICODESET_HIGH;
916 }
917 if (i > 0 && c == list[i-1]) {
918 // collapse adjacent ranges
919
920 // [..., start_k-1, c, c, limit_k, ..., HIGH]
921 // ^
922 // list[i]
923
924 //for (int32_t k=i-1; k<len-2; ++k) {
925 // list[k] = list[k+2];
926 //}
927 UChar32* dst = list + i - 1;
928 UChar32* src = dst + 2;
929 UChar32* srclimit = list + len;
930 while (src < srclimit) *(dst++) = *(src++);
931
932 len -= 2;
933 }
934 }
935
936 else if (i > 0 && c == list[i-1]) {
937 // c is after end of prior range
938 list[i-1]++;
939 // no need to check for collapse here
940 }
941
942 else {
943 // At this point we know the new char is not adjacent to
944 // any existing ranges, and it is not 10FFFF.
945
946
947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
948 // ^
949 // list[i]
950
951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
952 // ^
953 // list[i]
954
955 UErrorCode status = U_ZERO_ERROR;
956 ensureCapacity(len+2, status);
957 if (U_FAILURE(status)) {
958 return *this; // There is no way to report this error :-(
959 }
960
961 //for (int32_t k=len-1; k>=i; --k) {
962 // list[k+2] = list[k];
963 //}
964 UChar32* src = list + len;
965 UChar32* dst = src + 2;
966 UChar32* srclimit = list + i;
967 while (src > srclimit) *(--dst) = *(--src);
968
969 list[i] = c;
970 list[i+1] = c+1;
971 len += 2;
972 }
973
974 #ifdef DEBUG_US_ADD
975 dump(list, len);
976 printf("\n");
977
978 for (i=1; i<len; ++i) {
979 if (list[i] <= list[i-1]) {
980 // Corrupt array!
981 printf("ERROR: list has been corrupted\n");
982 exit(1);
983 }
984 }
985 #endif
986
987 releasePattern();
988 return *this;
989 }
990
991 /**
992 * Adds the specified multicharacter to this set if it is not already
993 * present. If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
999 */
add(const UnicodeString & s)1000 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1001 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1002 int32_t cp = getSingleCP(s);
1003 if (cp < 0) {
1004 if (!strings->contains((void*) &s)) {
1005 _add(s);
1006 releasePattern();
1007 }
1008 } else {
1009 add((UChar32)cp);
1010 }
1011 return *this;
1012 }
1013
1014 /**
1015 * Adds the given string, in order, to 'strings'. The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1018 */
_add(const UnicodeString & s)1019 void UnicodeSet::_add(const UnicodeString& s) {
1020 if (isFrozen() || isBogus()) {
1021 return;
1022 }
1023 UnicodeString* t = new UnicodeString(s);
1024 if (t == NULL) { // Check for memory allocation error.
1025 setToBogus();
1026 return;
1027 }
1028 UErrorCode ec = U_ZERO_ERROR;
1029 strings->sortedInsert(t, compareUnicodeString, ec);
1030 if (U_FAILURE(ec)) {
1031 setToBogus();
1032 delete t;
1033 }
1034 }
1035
1036 /**
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1040 */
getSingleCP(const UnicodeString & s)1041 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1042 //if (s.length() < 1) {
1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1044 //}
1045 if (s.length() > 2) return -1;
1046 if (s.length() == 1) return s.charAt(0);
1047
1048 // at this point, len = 2
1049 UChar32 cp = s.char32At(0);
1050 if (cp > 0xFFFF) { // is surrogate pair
1051 return cp;
1052 }
1053 return -1;
1054 }
1055
1056 /**
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1061 */
addAll(const UnicodeString & s)1062 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1063 UChar32 cp;
1064 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1065 cp = s.char32At(i);
1066 add(cp);
1067 }
1068 return *this;
1069 }
1070
1071 /**
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1076 */
retainAll(const UnicodeString & s)1077 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1078 UnicodeSet set;
1079 set.addAll(s);
1080 retainAll(set);
1081 return *this;
1082 }
1083
1084 /**
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1089 */
complementAll(const UnicodeString & s)1090 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1091 UnicodeSet set;
1092 set.addAll(s);
1093 complementAll(set);
1094 return *this;
1095 }
1096
1097 /**
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1102 */
removeAll(const UnicodeString & s)1103 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1104 UnicodeSet set;
1105 set.addAll(s);
1106 removeAll(set);
1107 return *this;
1108 }
1109
removeAllStrings()1110 UnicodeSet& UnicodeSet::removeAllStrings() {
1111 strings->removeAllElements();
1112 return *this;
1113 }
1114
1115
1116 /**
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1121 */
createFrom(const UnicodeString & s)1122 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1123 UnicodeSet *set = new UnicodeSet();
1124 if (set != NULL) { // Check for memory allocation error.
1125 set->add(s);
1126 }
1127 return set;
1128 }
1129
1130
1131 /**
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1135 */
createFromAll(const UnicodeString & s)1136 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1137 UnicodeSet *set = new UnicodeSet();
1138 if (set != NULL) { // Check for memory allocation error.
1139 set->addAll(s);
1140 }
1141 return set;
1142 }
1143
1144 /**
1145 * Retain only the elements in this set that are contained in the
1146 * specified range. If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1148 *
1149 * @param start first character, inclusive, of range to be retained
1150 * to this set.
1151 * @param end last character, inclusive, of range to be retained
1152 * to this set.
1153 */
retain(UChar32 start,UChar32 end)1154 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1155 if (pinCodePoint(start) <= pinCodePoint(end)) {
1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157 retain(range, 2, 0);
1158 } else {
1159 clear();
1160 }
1161 return *this;
1162 }
1163
retain(UChar32 c)1164 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1165 return retain(c, c);
1166 }
1167
1168 /**
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns. If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1173 *
1174 * @param start first character, inclusive, of range to be removed
1175 * from this set.
1176 * @param end last character, inclusive, of range to be removed
1177 * from this set.
1178 */
remove(UChar32 start,UChar32 end)1179 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1180 if (pinCodePoint(start) <= pinCodePoint(end)) {
1181 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1182 retain(range, 2, 2);
1183 }
1184 return *this;
1185 }
1186
1187 /**
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1190 * returns.
1191 */
remove(UChar32 c)1192 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1193 return remove(c, c);
1194 }
1195
1196 /**
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1199 * returns.
1200 * @param the source string
1201 * @return the modified set, for chaining
1202 */
remove(const UnicodeString & s)1203 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1204 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1205 int32_t cp = getSingleCP(s);
1206 if (cp < 0) {
1207 strings->removeElement((void*) &s);
1208 releasePattern();
1209 } else {
1210 remove((UChar32)cp, (UChar32)cp);
1211 }
1212 return *this;
1213 }
1214
1215 /**
1216 * Complements the specified range in this set. Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set. If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1220 *
1221 * @param start first character, inclusive, of range to be removed
1222 * from this set.
1223 * @param end last character, inclusive, of range to be removed
1224 * from this set.
1225 */
complement(UChar32 start,UChar32 end)1226 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1227 if (isFrozen() || isBogus()) {
1228 return *this;
1229 }
1230 if (pinCodePoint(start) <= pinCodePoint(end)) {
1231 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1232 exclusiveOr(range, 2, 0);
1233 }
1234 releasePattern();
1235 return *this;
1236 }
1237
complement(UChar32 c)1238 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1239 return complement(c, c);
1240 }
1241
1242 /**
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1245 */
complement(void)1246 UnicodeSet& UnicodeSet::complement(void) {
1247 if (isFrozen() || isBogus()) {
1248 return *this;
1249 }
1250 UErrorCode status = U_ZERO_ERROR;
1251 if (list[0] == UNICODESET_LOW) {
1252 ensureBufferCapacity(len-1, status);
1253 if (U_FAILURE(status)) {
1254 return *this;
1255 }
1256 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1257 --len;
1258 } else {
1259 ensureBufferCapacity(len+1, status);
1260 if (U_FAILURE(status)) {
1261 return *this;
1262 }
1263 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1264 buffer[0] = UNICODESET_LOW;
1265 ++len;
1266 }
1267 swapBuffers();
1268 releasePattern();
1269 return *this;
1270 }
1271
1272 /**
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1275 * returns.
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1279 */
complement(const UnicodeString & s)1280 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1281 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1282 int32_t cp = getSingleCP(s);
1283 if (cp < 0) {
1284 if (strings->contains((void*) &s)) {
1285 strings->removeElement((void*) &s);
1286 } else {
1287 _add(s);
1288 }
1289 releasePattern();
1290 } else {
1291 complement((UChar32)cp, (UChar32)cp);
1292 }
1293 return *this;
1294 }
1295
1296 /**
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present. This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets. The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1302 *
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1305 */
addAll(const UnicodeSet & c)1306 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1307 if ( c.len>0 && c.list!=NULL ) {
1308 add(c.list, c.len, 0);
1309 }
1310
1311 // Add strings in order
1312 if ( c.strings!=NULL ) {
1313 for (int32_t i=0; i<c.strings->size(); ++i) {
1314 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1315 if (!strings->contains((void*) s)) {
1316 _add(*s);
1317 }
1318 }
1319 }
1320 return *this;
1321 }
1322
1323 /**
1324 * Retains only the elements in this set that are contained in the
1325 * specified set. In other words, removes from this set all of
1326 * its elements that are not contained in the specified set. This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1329 *
1330 * @param c set that defines which elements this set will retain.
1331 */
retainAll(const UnicodeSet & c)1332 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1333 if (isFrozen() || isBogus()) {
1334 return *this;
1335 }
1336 retain(c.list, c.len, 0);
1337 strings->retainAll(*c.strings);
1338 return *this;
1339 }
1340
1341 /**
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set. This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1345 * the two sets.
1346 *
1347 * @param c set that defines which elements will be removed from
1348 * this set.
1349 */
removeAll(const UnicodeSet & c)1350 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1351 if (isFrozen() || isBogus()) {
1352 return *this;
1353 }
1354 retain(c.list, c.len, 2);
1355 strings->removeAll(*c.strings);
1356 return *this;
1357 }
1358
1359 /**
1360 * Complements in this set all elements contained in the specified
1361 * set. Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1363 *
1364 * @param c set that defines which elements will be xor'ed from
1365 * this set.
1366 */
complementAll(const UnicodeSet & c)1367 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1368 if (isFrozen() || isBogus()) {
1369 return *this;
1370 }
1371 exclusiveOr(c.list, c.len, 0);
1372
1373 for (int32_t i=0; i<c.strings->size(); ++i) {
1374 void* e = c.strings->elementAt(i);
1375 if (!strings->removeElement(e)) {
1376 _add(*(const UnicodeString*)e);
1377 }
1378 }
1379 return *this;
1380 }
1381
1382 /**
1383 * Removes all of the elements from this set. This set will be
1384 * empty after this call returns.
1385 */
clear(void)1386 UnicodeSet& UnicodeSet::clear(void) {
1387 if (isFrozen()) {
1388 return *this;
1389 }
1390 if (list != NULL) {
1391 list[0] = UNICODESET_HIGH;
1392 }
1393 len = 1;
1394 releasePattern();
1395 if (strings != NULL) {
1396 strings->removeAllElements();
1397 }
1398 if (list != NULL && strings != NULL) {
1399 // Remove bogus
1400 fFlags = 0;
1401 }
1402 return *this;
1403 }
1404
1405 /**
1406 * Iteration method that returns the number of ranges contained in
1407 * this set.
1408 * @see #getRangeStart
1409 * @see #getRangeEnd
1410 */
getRangeCount() const1411 int32_t UnicodeSet::getRangeCount() const {
1412 return len/2;
1413 }
1414
1415 /**
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1419 * @see #getRangeEnd
1420 */
getRangeStart(int32_t index) const1421 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1422 return list[index*2];
1423 }
1424
1425 /**
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1429 * @see #getRangeEnd
1430 */
getRangeEnd(int32_t index) const1431 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1432 return list[index*2 + 1] - 1;
1433 }
1434
getStringCount() const1435 int32_t UnicodeSet::getStringCount() const {
1436 return strings->size();
1437 }
1438
getString(int32_t index) const1439 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1440 return (const UnicodeString*) strings->elementAt(index);
1441 }
1442
1443 /**
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1446 */
compact()1447 UnicodeSet& UnicodeSet::compact() {
1448 if (isFrozen() || isBogus()) {
1449 return *this;
1450 }
1451 // Delete buffer first to defragment memory less.
1452 if (buffer != NULL) {
1453 uprv_free(buffer);
1454 buffer = NULL;
1455 }
1456 if (len < capacity) {
1457 // Make the capacity equal to len or 1.
1458 // We don't want to realloc of 0 size.
1459 int32_t newCapacity = len + (len == 0);
1460 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1461 if (temp) {
1462 list = temp;
1463 capacity = newCapacity;
1464 }
1465 // else what the heck happened?! We allocated less memory!
1466 // Oh well. We'll keep our original array.
1467 }
1468 return *this;
1469 }
1470
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1471 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1472 int32_t bmpLength, length, destLength;
1473
1474 if (U_FAILURE(ec)) {
1475 return 0;
1476 }
1477
1478 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1479 ec=U_ILLEGAL_ARGUMENT_ERROR;
1480 return 0;
1481 }
1482
1483 /* count necessary 16-bit units */
1484 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1485 // assert(length>=0);
1486 if (length==0) {
1487 /* empty set */
1488 if (destCapacity>0) {
1489 *dest=0;
1490 } else {
1491 ec=U_BUFFER_OVERFLOW_ERROR;
1492 }
1493 return 1;
1494 }
1495 /* now length>0 */
1496
1497 if (this->list[length-1]<=0xffff) {
1498 /* all BMP */
1499 bmpLength=length;
1500 } else if (this->list[0]>=0x10000) {
1501 /* all supplementary */
1502 bmpLength=0;
1503 length*=2;
1504 } else {
1505 /* some BMP, some supplementary */
1506 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1507 length=bmpLength+2*(length-bmpLength);
1508 }
1509
1510 /* length: number of 16-bit array units */
1511 if (length>0x7fff) {
1512 /* there are only 15 bits for the length in the first serialized word */
1513 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1514 return 0;
1515 }
1516
1517 /*
1518 * total serialized length:
1519 * number of 16-bit array units (length) +
1520 * 1 length unit (always) +
1521 * 1 bmpLength unit (if there are supplementary values)
1522 */
1523 destLength=length+((length>bmpLength)?2:1);
1524 if (destLength<=destCapacity) {
1525 const UChar32 *p;
1526 int32_t i;
1527
1528 *dest=(uint16_t)length;
1529 if (length>bmpLength) {
1530 *dest|=0x8000;
1531 *++dest=(uint16_t)bmpLength;
1532 }
1533 ++dest;
1534
1535 /* write the BMP part of the array */
1536 p=this->list;
1537 for (i=0; i<bmpLength; ++i) {
1538 *dest++=(uint16_t)*p++;
1539 }
1540
1541 /* write the supplementary part of the array */
1542 for (; i<length; i+=2) {
1543 *dest++=(uint16_t)(*p>>16);
1544 *dest++=(uint16_t)*p++;
1545 }
1546 } else {
1547 ec=U_BUFFER_OVERFLOW_ERROR;
1548 }
1549 return destLength;
1550 }
1551
1552 //----------------------------------------------------------------
1553 // Implementation: Utility methods
1554 //----------------------------------------------------------------
1555
1556 /**
1557 * Allocate our strings vector and return TRUE if successful.
1558 */
allocateStrings(UErrorCode & status)1559 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1560 if (U_FAILURE(status)) {
1561 return FALSE;
1562 }
1563 strings = new UVector(uprv_deleteUObject,
1564 uhash_compareUnicodeString, 1, status);
1565 if (strings == NULL) { // Check for memory allocation error.
1566 status = U_MEMORY_ALLOCATION_ERROR;
1567 return FALSE;
1568 }
1569 if (U_FAILURE(status)) {
1570 delete strings;
1571 strings = NULL;
1572 return FALSE;
1573 }
1574 return TRUE;
1575 }
1576
ensureCapacity(int32_t newLen,UErrorCode & ec)1577 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1578 if (newLen <= capacity)
1579 return;
1580 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1581 if (temp == NULL) {
1582 ec = U_MEMORY_ALLOCATION_ERROR;
1583 setToBogus();
1584 return;
1585 }
1586 list = temp;
1587 capacity = newLen + GROW_EXTRA;
1588 // else we keep the original contents on the memory failure.
1589 }
1590
ensureBufferCapacity(int32_t newLen,UErrorCode & ec)1591 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1592 if (buffer != NULL && newLen <= bufferCapacity)
1593 return;
1594 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1595 if (temp == NULL) {
1596 ec = U_MEMORY_ALLOCATION_ERROR;
1597 setToBogus();
1598 return;
1599 }
1600 buffer = temp;
1601 bufferCapacity = newLen + GROW_EXTRA;
1602 // else we keep the original contents on the memory failure.
1603 }
1604
1605 /**
1606 * Swap list and buffer.
1607 */
swapBuffers(void)1608 void UnicodeSet::swapBuffers(void) {
1609 // swap list and buffer
1610 UChar32* temp = list;
1611 list = buffer;
1612 buffer = temp;
1613
1614 int32_t c = capacity;
1615 capacity = bufferCapacity;
1616 bufferCapacity = c;
1617 }
1618
setToBogus()1619 void UnicodeSet::setToBogus() {
1620 clear(); // Remove everything in the set.
1621 fFlags = kIsBogus;
1622 }
1623
1624 //----------------------------------------------------------------
1625 // Implementation: Fundamental operators
1626 //----------------------------------------------------------------
1627
max(UChar32 a,UChar32 b)1628 static inline UChar32 max(UChar32 a, UChar32 b) {
1629 return (a > b) ? a : b;
1630 }
1631
1632 // polarity = 0, 3 is normal: x xor y
1633 // polarity = 1, 2: x xor ~y == x === y
1634
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1635 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1636 if (isFrozen() || isBogus()) {
1637 return;
1638 }
1639 UErrorCode status = U_ZERO_ERROR;
1640 ensureBufferCapacity(len + otherLen, status);
1641 if (U_FAILURE(status)) {
1642 return;
1643 }
1644
1645 int32_t i = 0, j = 0, k = 0;
1646 UChar32 a = list[i++];
1647 UChar32 b;
1648 if (polarity == 1 || polarity == 2) {
1649 b = UNICODESET_LOW;
1650 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1651 ++j;
1652 b = other[j];
1653 }
1654 } else {
1655 b = other[j++];
1656 }
1657 // simplest of all the routines
1658 // sort the values, discarding identicals!
1659 for (;;) {
1660 if (a < b) {
1661 buffer[k++] = a;
1662 a = list[i++];
1663 } else if (b < a) {
1664 buffer[k++] = b;
1665 b = other[j++];
1666 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1667 // discard both values!
1668 a = list[i++];
1669 b = other[j++];
1670 } else { // DONE!
1671 buffer[k++] = UNICODESET_HIGH;
1672 len = k;
1673 break;
1674 }
1675 }
1676 swapBuffers();
1677 releasePattern();
1678 }
1679
1680 // polarity = 0 is normal: x union y
1681 // polarity = 2: x union ~y
1682 // polarity = 1: ~x union y
1683 // polarity = 3: ~x union ~y
1684
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1685 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1686 if (isFrozen() || isBogus() || other==NULL) {
1687 return;
1688 }
1689 UErrorCode status = U_ZERO_ERROR;
1690 ensureBufferCapacity(len + otherLen, status);
1691 if (U_FAILURE(status)) {
1692 return;
1693 }
1694
1695 int32_t i = 0, j = 0, k = 0;
1696 UChar32 a = list[i++];
1697 UChar32 b = other[j++];
1698 // change from xor is that we have to check overlapping pairs
1699 // polarity bit 1 means a is second, bit 2 means b is.
1700 for (;;) {
1701 switch (polarity) {
1702 case 0: // both first; take lower if unequal
1703 if (a < b) { // take a
1704 // Back up over overlapping ranges in buffer[]
1705 if (k > 0 && a <= buffer[k-1]) {
1706 // Pick latter end value in buffer[] vs. list[]
1707 a = max(list[i], buffer[--k]);
1708 } else {
1709 // No overlap
1710 buffer[k++] = a;
1711 a = list[i];
1712 }
1713 i++; // Common if/else code factored out
1714 polarity ^= 1;
1715 } else if (b < a) { // take b
1716 if (k > 0 && b <= buffer[k-1]) {
1717 b = max(other[j], buffer[--k]);
1718 } else {
1719 buffer[k++] = b;
1720 b = other[j];
1721 }
1722 j++;
1723 polarity ^= 2;
1724 } else { // a == b, take a, drop b
1725 if (a == UNICODESET_HIGH) goto loop_end;
1726 // This is symmetrical; it doesn't matter if
1727 // we backtrack with a or b. - liu
1728 if (k > 0 && a <= buffer[k-1]) {
1729 a = max(list[i], buffer[--k]);
1730 } else {
1731 // No overlap
1732 buffer[k++] = a;
1733 a = list[i];
1734 }
1735 i++;
1736 polarity ^= 1;
1737 b = other[j++];
1738 polarity ^= 2;
1739 }
1740 break;
1741 case 3: // both second; take higher if unequal, and drop other
1742 if (b <= a) { // take a
1743 if (a == UNICODESET_HIGH) goto loop_end;
1744 buffer[k++] = a;
1745 } else { // take b
1746 if (b == UNICODESET_HIGH) goto loop_end;
1747 buffer[k++] = b;
1748 }
1749 a = list[i++];
1750 polarity ^= 1; // factored common code
1751 b = other[j++];
1752 polarity ^= 2;
1753 break;
1754 case 1: // a second, b first; if b < a, overlap
1755 if (a < b) { // no overlap, take a
1756 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1757 } else if (b < a) { // OVERLAP, drop b
1758 b = other[j++];
1759 polarity ^= 2;
1760 } else { // a == b, drop both!
1761 if (a == UNICODESET_HIGH) goto loop_end;
1762 a = list[i++];
1763 polarity ^= 1;
1764 b = other[j++];
1765 polarity ^= 2;
1766 }
1767 break;
1768 case 2: // a first, b second; if a < b, overlap
1769 if (b < a) { // no overlap, take b
1770 buffer[k++] = b;
1771 b = other[j++];
1772 polarity ^= 2;
1773 } else if (a < b) { // OVERLAP, drop a
1774 a = list[i++];
1775 polarity ^= 1;
1776 } else { // a == b, drop both!
1777 if (a == UNICODESET_HIGH) goto loop_end;
1778 a = list[i++];
1779 polarity ^= 1;
1780 b = other[j++];
1781 polarity ^= 2;
1782 }
1783 break;
1784 }
1785 }
1786 loop_end:
1787 buffer[k++] = UNICODESET_HIGH; // terminate
1788 len = k;
1789 swapBuffers();
1790 releasePattern();
1791 }
1792
1793 // polarity = 0 is normal: x intersect y
1794 // polarity = 2: x intersect ~y == set-minus
1795 // polarity = 1: ~x intersect y
1796 // polarity = 3: ~x intersect ~y
1797
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1798 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1799 if (isFrozen() || isBogus()) {
1800 return;
1801 }
1802 UErrorCode status = U_ZERO_ERROR;
1803 ensureBufferCapacity(len + otherLen, status);
1804 if (U_FAILURE(status)) {
1805 return;
1806 }
1807
1808 int32_t i = 0, j = 0, k = 0;
1809 UChar32 a = list[i++];
1810 UChar32 b = other[j++];
1811 // change from xor is that we have to check overlapping pairs
1812 // polarity bit 1 means a is second, bit 2 means b is.
1813 for (;;) {
1814 switch (polarity) {
1815 case 0: // both first; drop the smaller
1816 if (a < b) { // drop a
1817 a = list[i++];
1818 polarity ^= 1;
1819 } else if (b < a) { // drop b
1820 b = other[j++];
1821 polarity ^= 2;
1822 } else { // a == b, take one, drop other
1823 if (a == UNICODESET_HIGH) goto loop_end;
1824 buffer[k++] = a;
1825 a = list[i++];
1826 polarity ^= 1;
1827 b = other[j++];
1828 polarity ^= 2;
1829 }
1830 break;
1831 case 3: // both second; take lower if unequal
1832 if (a < b) { // take a
1833 buffer[k++] = a;
1834 a = list[i++];
1835 polarity ^= 1;
1836 } else if (b < a) { // take b
1837 buffer[k++] = b;
1838 b = other[j++];
1839 polarity ^= 2;
1840 } else { // a == b, take one, drop other
1841 if (a == UNICODESET_HIGH) goto loop_end;
1842 buffer[k++] = a;
1843 a = list[i++];
1844 polarity ^= 1;
1845 b = other[j++];
1846 polarity ^= 2;
1847 }
1848 break;
1849 case 1: // a second, b first;
1850 if (a < b) { // NO OVERLAP, drop a
1851 a = list[i++];
1852 polarity ^= 1;
1853 } else if (b < a) { // OVERLAP, take b
1854 buffer[k++] = b;
1855 b = other[j++];
1856 polarity ^= 2;
1857 } else { // a == b, drop both!
1858 if (a == UNICODESET_HIGH) goto loop_end;
1859 a = list[i++];
1860 polarity ^= 1;
1861 b = other[j++];
1862 polarity ^= 2;
1863 }
1864 break;
1865 case 2: // a first, b second; if a < b, overlap
1866 if (b < a) { // no overlap, drop b
1867 b = other[j++];
1868 polarity ^= 2;
1869 } else if (a < b) { // OVERLAP, take a
1870 buffer[k++] = a;
1871 a = list[i++];
1872 polarity ^= 1;
1873 } else { // a == b, drop both!
1874 if (a == UNICODESET_HIGH) goto loop_end;
1875 a = list[i++];
1876 polarity ^= 1;
1877 b = other[j++];
1878 polarity ^= 2;
1879 }
1880 break;
1881 }
1882 }
1883 loop_end:
1884 buffer[k++] = UNICODESET_HIGH; // terminate
1885 len = k;
1886 swapBuffers();
1887 releasePattern();
1888 }
1889
1890 /**
1891 * Append the <code>toPattern()</code> representation of a
1892 * string to the given <code>StringBuffer</code>.
1893 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1894 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1895 escapeUnprintable) {
1896 UChar32 cp;
1897 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1898 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1899 }
1900 }
1901
1902 /**
1903 * Append the <code>toPattern()</code> representation of a
1904 * character to the given <code>StringBuffer</code>.
1905 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1906 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1907 escapeUnprintable) {
1908 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1909 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1910 // unprintable
1911 if (ICU_Utility::escapeUnprintable(buf, c)) {
1912 return;
1913 }
1914 }
1915 // Okay to let ':' pass through
1916 switch (c) {
1917 case SET_OPEN:
1918 case SET_CLOSE:
1919 case HYPHEN:
1920 case COMPLEMENT:
1921 case INTERSECTION:
1922 case BACKSLASH:
1923 case OPEN_BRACE:
1924 case CLOSE_BRACE:
1925 case COLON:
1926 case SymbolTable::SYMBOL_REF:
1927 buf.append(BACKSLASH);
1928 break;
1929 default:
1930 // Escape whitespace
1931 if (PatternProps::isWhiteSpace(c)) {
1932 buf.append(BACKSLASH);
1933 }
1934 break;
1935 }
1936 buf.append(c);
1937 }
1938
1939 /**
1940 * Append a string representation of this set to result. This will be
1941 * a cleaned version of the string passed to applyPattern(), if there
1942 * is one. Otherwise it will be generated.
1943 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const1944 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
1945 UBool escapeUnprintable) const
1946 {
1947 if (pat != NULL) {
1948 int32_t i;
1949 int32_t backslashCount = 0;
1950 for (i=0; i<patLen; ) {
1951 UChar32 c;
1952 U16_NEXT(pat, i, patLen, c);
1953 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1954 // If the unprintable character is preceded by an odd
1955 // number of backslashes, then it has been escaped.
1956 // Before unescaping it, we delete the final
1957 // backslash.
1958 if ((backslashCount % 2) == 1) {
1959 result.truncate(result.length() - 1);
1960 }
1961 ICU_Utility::escapeUnprintable(result, c);
1962 backslashCount = 0;
1963 } else {
1964 result.append(c);
1965 if (c == BACKSLASH) {
1966 ++backslashCount;
1967 } else {
1968 backslashCount = 0;
1969 }
1970 }
1971 }
1972 return result;
1973 }
1974
1975 return _generatePattern(result, escapeUnprintable);
1976 }
1977
1978 /**
1979 * Returns a string representation of this set. If the result of
1980 * calling this function is passed to a UnicodeSet constructor, it
1981 * will produce another set that is equal to this one.
1982 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const1983 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
1984 UBool escapeUnprintable) const
1985 {
1986 result.truncate(0);
1987 return _toPattern(result, escapeUnprintable);
1988 }
1989
1990 /**
1991 * Generate and append a string representation of this set to result.
1992 * This does not use this.pat, the cleaned up copy of the string
1993 * passed to applyPattern().
1994 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const1995 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
1996 UBool escapeUnprintable) const
1997 {
1998 result.append(SET_OPEN);
1999
2000 // // Check against the predefined categories. We implicitly build
2001 // // up ALL category sets the first time toPattern() is called.
2002 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2003 // if (*this == getCategorySet(cat)) {
2004 // result.append(COLON);
2005 // result.append(CATEGORY_NAMES, cat*2, 2);
2006 // return result.append(CATEGORY_CLOSE);
2007 // }
2008 // }
2009
2010 int32_t count = getRangeCount();
2011
2012 // If the set contains at least 2 intervals and includes both
2013 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2014 // be more economical.
2015 if (count > 1 &&
2016 getRangeStart(0) == MIN_VALUE &&
2017 getRangeEnd(count-1) == MAX_VALUE) {
2018
2019 // Emit the inverse
2020 result.append(COMPLEMENT);
2021
2022 for (int32_t i = 1; i < count; ++i) {
2023 UChar32 start = getRangeEnd(i-1)+1;
2024 UChar32 end = getRangeStart(i)-1;
2025 _appendToPat(result, start, escapeUnprintable);
2026 if (start != end) {
2027 if ((start+1) != end) {
2028 result.append(HYPHEN);
2029 }
2030 _appendToPat(result, end, escapeUnprintable);
2031 }
2032 }
2033 }
2034
2035 // Default; emit the ranges as pairs
2036 else {
2037 for (int32_t i = 0; i < count; ++i) {
2038 UChar32 start = getRangeStart(i);
2039 UChar32 end = getRangeEnd(i);
2040 _appendToPat(result, start, escapeUnprintable);
2041 if (start != end) {
2042 if ((start+1) != end) {
2043 result.append(HYPHEN);
2044 }
2045 _appendToPat(result, end, escapeUnprintable);
2046 }
2047 }
2048 }
2049
2050 for (int32_t i = 0; i<strings->size(); ++i) {
2051 result.append(OPEN_BRACE);
2052 _appendToPat(result,
2053 *(const UnicodeString*) strings->elementAt(i),
2054 escapeUnprintable);
2055 result.append(CLOSE_BRACE);
2056 }
2057 return result.append(SET_CLOSE);
2058 }
2059
2060 /**
2061 * Release existing cached pattern
2062 */
releasePattern()2063 void UnicodeSet::releasePattern() {
2064 if (pat) {
2065 uprv_free(pat);
2066 pat = NULL;
2067 patLen = 0;
2068 }
2069 }
2070
2071 /**
2072 * Set the new pattern to cache.
2073 */
setPattern(const UnicodeString & newPat)2074 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2075 releasePattern();
2076 int32_t newPatLen = newPat.length();
2077 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2078 if (pat) {
2079 patLen = newPatLen;
2080 newPat.extractBetween(0, patLen, pat);
2081 pat[patLen] = 0;
2082 }
2083 // else we don't care if malloc failed. This was just a nice cache.
2084 // We can regenerate an equivalent pattern later when requested.
2085 }
2086
freeze()2087 UnicodeFunctor *UnicodeSet::freeze() {
2088 if(!isFrozen() && !isBogus()) {
2089 // Do most of what compact() does before freezing because
2090 // compact() will not work when the set is frozen.
2091 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2092
2093 // Delete buffer first to defragment memory less.
2094 if (buffer != NULL) {
2095 uprv_free(buffer);
2096 buffer = NULL;
2097 }
2098 if (capacity > (len + GROW_EXTRA)) {
2099 // Make the capacity equal to len or 1.
2100 // We don't want to realloc of 0 size.
2101 capacity = len + (len == 0);
2102 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2103 if (list == NULL) { // Check for memory allocation error.
2104 setToBogus();
2105 return this;
2106 }
2107 }
2108
2109 // Optimize contains() and span() and similar functions.
2110 if (!strings->isEmpty()) {
2111 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2112 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2113 // All strings are irrelevant for span() etc. because
2114 // all of each string's code points are contained in this set.
2115 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2116 // many relevant strings as UTF-16.
2117 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2118 delete stringSpan;
2119 stringSpan = NULL;
2120 }
2121 }
2122 if (stringSpan == NULL) {
2123 // No span-relevant strings: Optimize for code point spans.
2124 bmpSet=new BMPSet(list, len);
2125 if (bmpSet == NULL) { // Check for memory allocation error.
2126 setToBogus();
2127 }
2128 }
2129 }
2130 return this;
2131 }
2132
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2133 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2134 if(length>0 && bmpSet!=NULL) {
2135 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2136 }
2137 if(length<0) {
2138 length=u_strlen(s);
2139 }
2140 if(length==0) {
2141 return 0;
2142 }
2143 if(stringSpan!=NULL) {
2144 return stringSpan->span(s, length, spanCondition);
2145 } else if(!strings->isEmpty()) {
2146 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2147 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2148 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2149 UnicodeSetStringSpan strSpan(*this, *strings, which);
2150 if(strSpan.needsStringSpanUTF16()) {
2151 return strSpan.span(s, length, spanCondition);
2152 }
2153 }
2154
2155 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2156 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2157 }
2158
2159 UChar32 c;
2160 int32_t start=0, prev=0;
2161 do {
2162 U16_NEXT(s, start, length, c);
2163 if(spanCondition!=contains(c)) {
2164 break;
2165 }
2166 } while((prev=start)<length);
2167 return prev;
2168 }
2169
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2170 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2171 if(length>0 && bmpSet!=NULL) {
2172 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2173 }
2174 if(length<0) {
2175 length=u_strlen(s);
2176 }
2177 if(length==0) {
2178 return 0;
2179 }
2180 if(stringSpan!=NULL) {
2181 return stringSpan->spanBack(s, length, spanCondition);
2182 } else if(!strings->isEmpty()) {
2183 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2184 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2185 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2186 UnicodeSetStringSpan strSpan(*this, *strings, which);
2187 if(strSpan.needsStringSpanUTF16()) {
2188 return strSpan.spanBack(s, length, spanCondition);
2189 }
2190 }
2191
2192 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2193 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2194 }
2195
2196 UChar32 c;
2197 int32_t prev=length;
2198 do {
2199 U16_PREV(s, 0, length, c);
2200 if(spanCondition!=contains(c)) {
2201 break;
2202 }
2203 } while((prev=length)>0);
2204 return prev;
2205 }
2206
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2207 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2208 if(length>0 && bmpSet!=NULL) {
2209 const uint8_t *s0=(const uint8_t *)s;
2210 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2211 }
2212 if(length<0) {
2213 length=(int32_t)uprv_strlen(s);
2214 }
2215 if(length==0) {
2216 return 0;
2217 }
2218 if(stringSpan!=NULL) {
2219 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2220 } else if(!strings->isEmpty()) {
2221 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2222 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2223 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2224 UnicodeSetStringSpan strSpan(*this, *strings, which);
2225 if(strSpan.needsStringSpanUTF8()) {
2226 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2227 }
2228 }
2229
2230 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2231 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2232 }
2233
2234 UChar32 c;
2235 int32_t start=0, prev=0;
2236 do {
2237 U8_NEXT(s, start, length, c);
2238 if(c<0) {
2239 c=0xfffd;
2240 }
2241 if(spanCondition!=contains(c)) {
2242 break;
2243 }
2244 } while((prev=start)<length);
2245 return prev;
2246 }
2247
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2248 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2249 if(length>0 && bmpSet!=NULL) {
2250 const uint8_t *s0=(const uint8_t *)s;
2251 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2252 }
2253 if(length<0) {
2254 length=(int32_t)uprv_strlen(s);
2255 }
2256 if(length==0) {
2257 return 0;
2258 }
2259 if(stringSpan!=NULL) {
2260 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2261 } else if(!strings->isEmpty()) {
2262 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2263 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2264 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2265 UnicodeSetStringSpan strSpan(*this, *strings, which);
2266 if(strSpan.needsStringSpanUTF8()) {
2267 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2268 }
2269 }
2270
2271 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2272 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2273 }
2274
2275 UChar32 c;
2276 int32_t prev=length;
2277 do {
2278 U8_PREV(s, 0, length, c);
2279 if(c<0) {
2280 c=0xfffd;
2281 }
2282 if(spanCondition!=contains(c)) {
2283 break;
2284 }
2285 } while((prev=length)>0);
2286 return prev;
2287 }
2288
2289 U_NAMESPACE_END
2290