1 /*
2 **********************************************************************
3 * Copyright (C) 1999-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 **********************************************************************
9 */
10
11 #include "unicode/utypes.h"
12 #include "unicode/parsepos.h"
13 #include "unicode/symtable.h"
14 #include "unicode/uniset.h"
15 #include "unicode/utf8.h"
16 #include "unicode/utf16.h"
17 #include "ruleiter.h"
18 #include "cmemory.h"
19 #include "cstring.h"
20 #include "patternprops.h"
21 #include "uelement.h"
22 #include "util.h"
23 #include "uvector.h"
24 #include "charstr.h"
25 #include "ustrfmt.h"
26 #include "uassert.h"
27 #include "bmpset.h"
28 #include "unisetspan.h"
29
30 // Define UChar constants using hex for EBCDIC compatibility
31 // Used #define to reduce private static exports and memory access time.
32 #define SET_OPEN ((UChar)0x005B) /*[*/
33 #define SET_CLOSE ((UChar)0x005D) /*]*/
34 #define HYPHEN ((UChar)0x002D) /*-*/
35 #define COMPLEMENT ((UChar)0x005E) /*^*/
36 #define COLON ((UChar)0x003A) /*:*/
37 #define BACKSLASH ((UChar)0x005C) /*\*/
38 #define INTERSECTION ((UChar)0x0026) /*&*/
39 #define UPPER_U ((UChar)0x0055) /*U*/
40 #define LOWER_U ((UChar)0x0075) /*u*/
41 #define OPEN_BRACE ((UChar)123) /*{*/
42 #define CLOSE_BRACE ((UChar)125) /*}*/
43 #define UPPER_P ((UChar)0x0050) /*P*/
44 #define LOWER_P ((UChar)0x0070) /*p*/
45 #define UPPER_N ((UChar)78) /*N*/
46 #define EQUALS ((UChar)0x003D) /*=*/
47
48 // HIGH_VALUE > all valid values. 110000 for codepoints
49 #define UNICODESET_HIGH 0x0110000
50
51 // LOW <= all valid values. ZERO for codepoints
52 #define UNICODESET_LOW 0x000000
53
54 // initial storage. Must be >= 0
55 #define START_EXTRA 16
56
57 // extra amount for growth. Must be >= 0
58 #define GROW_EXTRA START_EXTRA
59
60 U_NAMESPACE_BEGIN
61
~SymbolTable()62 SymbolTable::~SymbolTable() {}
63
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66 /**
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
71 */
72 static inline UChar32 pinCodePoint(UChar32& c) {
73 if (c < UNICODESET_LOW) {
74 c = UNICODESET_LOW;
75 } else if (c > (UNICODESET_HIGH-1)) {
76 c = (UNICODESET_HIGH-1);
77 }
78 return c;
79 }
80
81 //----------------------------------------------------------------
82 // Debugging
83 //----------------------------------------------------------------
84
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
88 // like this:
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
94
95 // #define DEBUG_MEM
96
97 #ifdef DEBUG_MEM
98 #include <stdio.h>
99 static int32_t _dbgCount = 0;
100
_dbgct(UnicodeSet * set)101 static inline void _dbgct(UnicodeSet* set) {
102 UnicodeString str;
103 set->toPattern(str, TRUE);
104 char buf[40];
105 str.extract(0, 39, buf, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107 }
108
_dbgdt(UnicodeSet * set)109 static inline void _dbgdt(UnicodeSet* set) {
110 UnicodeString str;
111 set->toPattern(str, TRUE);
112 char buf[40];
113 str.extract(0, 39, buf, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115 }
116
117 #else
118
119 #define _dbgct(set)
120 #define _dbgdt(set)
121
122 #endif
123
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
127
cloneUnicodeString(UElement * dst,UElement * src)128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130 }
131
compareUnicodeString(UElement t1,UElement t2)132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135 return a.compare(b);
136 }
137
138 //----------------------------------------------------------------
139 // Constructors &c
140 //----------------------------------------------------------------
141
142 /**
143 * Constructs an empty set.
144 */
UnicodeSet()145 UnicodeSet::UnicodeSet() :
146 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
147 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
148 fFlags(0)
149 {
150 UErrorCode status = U_ZERO_ERROR;
151 allocateStrings(status);
152 if (U_FAILURE(status)) {
153 return;
154 }
155 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
156 if(list!=NULL){
157 list[0] = UNICODESET_HIGH;
158 } else { // If memory allocation failed, set to bogus state.
159 setToBogus();
160 return;
161 }
162 _dbgct(this);
163 }
164
165 /**
166 * Constructs a set containing the given range. If <code>end >
167 * start</code> then an empty set is created.
168 *
169 * @param start first character, inclusive, of range
170 * @param end last character, inclusive, of range
171 */
UnicodeSet(UChar32 start,UChar32 end)172 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) :
173 len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0),
174 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
175 fFlags(0)
176 {
177 UErrorCode status = U_ZERO_ERROR;
178 allocateStrings(status);
179 if (U_FAILURE(status)) {
180 return;
181 }
182 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
183 if(list!=NULL){
184 list[0] = UNICODESET_HIGH;
185 complement(start, end);
186 } else { // If memory allocation failed, set to bogus state.
187 setToBogus();
188 return;
189 }
190 _dbgct(this);
191 }
192
193 /**
194 * Constructs a set that is identical to the given UnicodeSet.
195 */
UnicodeSet(const UnicodeSet & o)196 UnicodeSet::UnicodeSet(const UnicodeSet& o) :
197 UnicodeFilter(o),
198 len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0),
199 bmpSet(0),
200 buffer(0), bufferCapacity(0),
201 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
202 fFlags(0)
203 {
204 UErrorCode status = U_ZERO_ERROR;
205 allocateStrings(status);
206 if (U_FAILURE(status)) {
207 return;
208 }
209 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
210 if(list!=NULL){
211 *this = o;
212 } else { // If memory allocation failed, set to bogus state.
213 setToBogus();
214 return;
215 }
216 _dbgct(this);
217 }
218
219 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)220 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) :
221 UnicodeFilter(o),
222 len(0), capacity(o.len + GROW_EXTRA), list(0),
223 bmpSet(0),
224 buffer(0), bufferCapacity(0),
225 patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
226 fFlags(0)
227 {
228 UErrorCode status = U_ZERO_ERROR;
229 allocateStrings(status);
230 if (U_FAILURE(status)) {
231 return;
232 }
233 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
234 if(list!=NULL){
235 // *this = o except for bmpSet and stringSpan
236 len = o.len;
237 uprv_memcpy(list, o.list, len*sizeof(UChar32));
238 if (strings != NULL && o.strings != NULL) {
239 strings->assign(*o.strings, cloneUnicodeString, status);
240 } else { // Invalid strings.
241 setToBogus();
242 return;
243 }
244 if (o.pat) {
245 setPattern(UnicodeString(o.pat, o.patLen));
246 }
247 } else { // If memory allocation failed, set to bogus state.
248 setToBogus();
249 return;
250 }
251 _dbgct(this);
252 }
253
254 /**
255 * Destructs the set.
256 */
~UnicodeSet()257 UnicodeSet::~UnicodeSet() {
258 _dbgdt(this); // first!
259 uprv_free(list);
260 delete bmpSet;
261 if (buffer) {
262 uprv_free(buffer);
263 }
264 delete strings;
265 delete stringSpan;
266 releasePattern();
267 }
268
269 /**
270 * Assigns this object to be a copy of another.
271 */
operator =(const UnicodeSet & o)272 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
273 if (this == &o) {
274 return *this;
275 }
276 if (isFrozen()) {
277 return *this;
278 }
279 if (o.isBogus()) {
280 setToBogus();
281 return *this;
282 }
283 UErrorCode ec = U_ZERO_ERROR;
284 ensureCapacity(o.len, ec);
285 if (U_FAILURE(ec)) {
286 return *this; // There is no way to report this error :-(
287 }
288 len = o.len;
289 uprv_memcpy(list, o.list, len*sizeof(UChar32));
290 if (o.bmpSet == NULL) {
291 bmpSet = NULL;
292 } else {
293 bmpSet = new BMPSet(*o.bmpSet, list, len);
294 if (bmpSet == NULL) { // Check for memory allocation error.
295 setToBogus();
296 return *this;
297 }
298 }
299 if (strings != NULL && o.strings != NULL) {
300 strings->assign(*o.strings, cloneUnicodeString, ec);
301 } else { // Invalid strings.
302 setToBogus();
303 return *this;
304 }
305 if (o.stringSpan == NULL) {
306 stringSpan = NULL;
307 } else {
308 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
309 if (stringSpan == NULL) { // Check for memory allocation error.
310 setToBogus();
311 return *this;
312 }
313 }
314 releasePattern();
315 if (o.pat) {
316 setPattern(UnicodeString(o.pat, o.patLen));
317 }
318 return *this;
319 }
320
321 /**
322 * Returns a copy of this object. All UnicodeMatcher objects have
323 * to support cloning in order to allow classes using
324 * UnicodeMatchers, such as Transliterator, to implement cloning.
325 */
clone() const326 UnicodeFunctor* UnicodeSet::clone() const {
327 return new UnicodeSet(*this);
328 }
329
cloneAsThawed() const330 UnicodeFunctor *UnicodeSet::cloneAsThawed() const {
331 return new UnicodeSet(*this, TRUE);
332 }
333
334 /**
335 * Compares the specified object with this set for equality. Returns
336 * <tt>true</tt> if the two sets
337 * have the same size, and every member of the specified set is
338 * contained in this set (or equivalently, every member of this set is
339 * contained in the specified set).
340 *
341 * @param o set to be compared for equality with this set.
342 * @return <tt>true</tt> if the specified set is equal to this set.
343 */
operator ==(const UnicodeSet & o) const344 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
345 if (len != o.len) return FALSE;
346 for (int32_t i = 0; i < len; ++i) {
347 if (list[i] != o.list[i]) return FALSE;
348 }
349 if (*strings != *o.strings) return FALSE;
350 return TRUE;
351 }
352
353 /**
354 * Returns the hash code value for this set.
355 *
356 * @return the hash code value for this set.
357 * @see Object#hashCode()
358 */
hashCode(void) const359 int32_t UnicodeSet::hashCode(void) const {
360 int32_t result = len;
361 for (int32_t i = 0; i < len; ++i) {
362 result *= 1000003;
363 result += list[i];
364 }
365 return result;
366 }
367
368 //----------------------------------------------------------------
369 // Public API
370 //----------------------------------------------------------------
371
372 /**
373 * Returns the number of elements in this set (its cardinality),
374 * Note than the elements of a set may include both individual
375 * codepoints and strings.
376 *
377 * @return the number of elements in this set (its cardinality).
378 */
size(void) const379 int32_t UnicodeSet::size(void) const {
380 int32_t n = 0;
381 int32_t count = getRangeCount();
382 for (int32_t i = 0; i < count; ++i) {
383 n += getRangeEnd(i) - getRangeStart(i) + 1;
384 }
385 return n + strings->size();
386 }
387
388 /**
389 * Returns <tt>true</tt> if this set contains no elements.
390 *
391 * @return <tt>true</tt> if this set contains no elements.
392 */
isEmpty(void) const393 UBool UnicodeSet::isEmpty(void) const {
394 return len == 1 && strings->size() == 0;
395 }
396
397 /**
398 * Returns true if this set contains the given character.
399 * @param c character to be checked for containment
400 * @return true if the test condition is met
401 */
contains(UChar32 c) const402 UBool UnicodeSet::contains(UChar32 c) const {
403 // Set i to the index of the start item greater than ch
404 // We know we will terminate without length test!
405 // LATER: for large sets, add binary search
406 //int32_t i = -1;
407 //for (;;) {
408 // if (c < list[++i]) break;
409 //}
410 if (bmpSet != NULL) {
411 return bmpSet->contains(c);
412 }
413 if (stringSpan != NULL) {
414 return stringSpan->contains(c);
415 }
416 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
417 return FALSE;
418 }
419 int32_t i = findCodePoint(c);
420 return (UBool)(i & 1); // return true if odd
421 }
422
423 /**
424 * Returns the smallest value i such that c < list[i]. Caller
425 * must ensure that c is a legal value or this method will enter
426 * an infinite loop. This method performs a binary search.
427 * @param c a character in the range MIN_VALUE..MAX_VALUE
428 * inclusive
429 * @return the smallest integer i in the range 0..len-1,
430 * inclusive, such that c < list[i]
431 */
findCodePoint(UChar32 c) const432 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
433 /* Examples:
434 findCodePoint(c)
435 set list[] c=0 1 3 4 7 8
436 === ============== ===========
437 [] [110000] 0 0 0 0 0 0
438 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
439 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
440 [:Any:] [0, 110000] 1 1 1 1 1 1
441 */
442
443 // Return the smallest i such that c < list[i]. Assume
444 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
445 if (c < list[0])
446 return 0;
447 // High runner test. c is often after the last range, so an
448 // initial check for this condition pays off.
449 int32_t lo = 0;
450 int32_t hi = len - 1;
451 if (lo >= hi || c >= list[hi-1])
452 return hi;
453 // invariant: c >= list[lo]
454 // invariant: c < list[hi]
455 for (;;) {
456 int32_t i = (lo + hi) >> 1;
457 if (i == lo) {
458 break; // Found!
459 } else if (c < list[i]) {
460 hi = i;
461 } else {
462 lo = i;
463 }
464 }
465 return hi;
466 }
467
468 /**
469 * Returns true if this set contains every character
470 * of the given range.
471 * @param start first character, inclusive, of the range
472 * @param end last character, inclusive, of the range
473 * @return true if the test condition is met
474 */
contains(UChar32 start,UChar32 end) const475 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
476 //int32_t i = -1;
477 //for (;;) {
478 // if (start < list[++i]) break;
479 //}
480 int32_t i = findCodePoint(start);
481 return ((i & 1) != 0 && end < list[i]);
482 }
483
484 /**
485 * Returns <tt>true</tt> if this set contains the given
486 * multicharacter string.
487 * @param s string to be checked for containment
488 * @return <tt>true</tt> if this set contains the specified string
489 */
contains(const UnicodeString & s) const490 UBool UnicodeSet::contains(const UnicodeString& s) const {
491 if (s.length() == 0) return FALSE;
492 int32_t cp = getSingleCP(s);
493 if (cp < 0) {
494 return strings->contains((void*) &s);
495 } else {
496 return contains((UChar32) cp);
497 }
498 }
499
500 /**
501 * Returns true if this set contains all the characters and strings
502 * of the given set.
503 * @param c set to be checked for containment
504 * @return true if the test condition is met
505 */
containsAll(const UnicodeSet & c) const506 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
507 // The specified set is a subset if all of its pairs are contained in
508 // this set. It's possible to code this more efficiently in terms of
509 // direct manipulation of the inversion lists if the need arises.
510 int32_t n = c.getRangeCount();
511 for (int i=0; i<n; ++i) {
512 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
513 return FALSE;
514 }
515 }
516 if (!strings->containsAll(*c.strings)) return FALSE;
517 return TRUE;
518 }
519
520 /**
521 * Returns true if this set contains all the characters
522 * of the given string.
523 * @param s string containing characters to be checked for containment
524 * @return true if the test condition is met
525 */
containsAll(const UnicodeString & s) const526 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
527 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
528 s.length());
529 }
530
531 /**
532 * Returns true if this set contains none of the characters
533 * of the given range.
534 * @param start first character, inclusive, of the range
535 * @param end last character, inclusive, of the range
536 * @return true if the test condition is met
537 */
containsNone(UChar32 start,UChar32 end) const538 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
539 //int32_t i = -1;
540 //for (;;) {
541 // if (start < list[++i]) break;
542 //}
543 int32_t i = findCodePoint(start);
544 return ((i & 1) == 0 && end < list[i]);
545 }
546
547 /**
548 * Returns true if this set contains none of the characters and strings
549 * of the given set.
550 * @param c set to be checked for containment
551 * @return true if the test condition is met
552 */
containsNone(const UnicodeSet & c) const553 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
554 // The specified set is a subset if all of its pairs are contained in
555 // this set. It's possible to code this more efficiently in terms of
556 // direct manipulation of the inversion lists if the need arises.
557 int32_t n = c.getRangeCount();
558 for (int32_t i=0; i<n; ++i) {
559 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
560 return FALSE;
561 }
562 }
563 if (!strings->containsNone(*c.strings)) return FALSE;
564 return TRUE;
565 }
566
567 /**
568 * Returns true if this set contains none of the characters
569 * of the given string.
570 * @param s string containing characters to be checked for containment
571 * @return true if the test condition is met
572 */
containsNone(const UnicodeString & s) const573 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
574 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
575 s.length());
576 }
577
578 /**
579 * Returns <tt>true</tt> if this set contains any character whose low byte
580 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
581 * indexing.
582 */
matchesIndexValue(uint8_t v) const583 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
584 /* The index value v, in the range [0,255], is contained in this set if
585 * it is contained in any pair of this set. Pairs either have the high
586 * bytes equal, or unequal. If the high bytes are equal, then we have
587 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
588 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
589 * Then v is contained if xx <= v || v <= yy. (This is identical to the
590 * time zone month containment logic.)
591 */
592 int32_t i;
593 int32_t rangeCount=getRangeCount();
594 for (i=0; i<rangeCount; ++i) {
595 UChar32 low = getRangeStart(i);
596 UChar32 high = getRangeEnd(i);
597 if ((low & ~0xFF) == (high & ~0xFF)) {
598 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
599 return TRUE;
600 }
601 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
602 return TRUE;
603 }
604 }
605 if (strings->size() != 0) {
606 for (i=0; i<strings->size(); ++i) {
607 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
608 //if (s.length() == 0) {
609 // // Empty strings match everything
610 // return TRUE;
611 //}
612 // assert(s.length() != 0); // We enforce this elsewhere
613 UChar32 c = s.char32At(0);
614 if ((c & 0xFF) == v) {
615 return TRUE;
616 }
617 }
618 }
619 return FALSE;
620 }
621
622 /**
623 * Implementation of UnicodeMatcher::matches(). Always matches the
624 * longest possible multichar string.
625 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)626 UMatchDegree UnicodeSet::matches(const Replaceable& text,
627 int32_t& offset,
628 int32_t limit,
629 UBool incremental) {
630 if (offset == limit) {
631 // Strings, if any, have length != 0, so we don't worry
632 // about them here. If we ever allow zero-length strings
633 // we much check for them here.
634 if (contains(U_ETHER)) {
635 return incremental ? U_PARTIAL_MATCH : U_MATCH;
636 } else {
637 return U_MISMATCH;
638 }
639 } else {
640 if (strings->size() != 0) { // try strings first
641
642 // might separate forward and backward loops later
643 // for now they are combined
644
645 // TODO Improve efficiency of this, at least in the forward
646 // direction, if not in both. In the forward direction we
647 // can assume the strings are sorted.
648
649 int32_t i;
650 UBool forward = offset < limit;
651
652 // firstChar is the leftmost char to match in the
653 // forward direction or the rightmost char to match in
654 // the reverse direction.
655 UChar firstChar = text.charAt(offset);
656
657 // If there are multiple strings that can match we
658 // return the longest match.
659 int32_t highWaterLength = 0;
660
661 for (i=0; i<strings->size(); ++i) {
662 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
663
664 //if (trial.length() == 0) {
665 // return U_MATCH; // null-string always matches
666 //}
667 // assert(trial.length() != 0); // We ensure this elsewhere
668
669 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
670
671 // Strings are sorted, so we can optimize in the
672 // forward direction.
673 if (forward && c > firstChar) break;
674 if (c != firstChar) continue;
675
676 int32_t matchLen = matchRest(text, offset, limit, trial);
677
678 if (incremental) {
679 int32_t maxLen = forward ? limit-offset : offset-limit;
680 if (matchLen == maxLen) {
681 // We have successfully matched but only up to limit.
682 return U_PARTIAL_MATCH;
683 }
684 }
685
686 if (matchLen == trial.length()) {
687 // We have successfully matched the whole string.
688 if (matchLen > highWaterLength) {
689 highWaterLength = matchLen;
690 }
691 // In the forward direction we know strings
692 // are sorted so we can bail early.
693 if (forward && matchLen < highWaterLength) {
694 break;
695 }
696 continue;
697 }
698 }
699
700 // We've checked all strings without a partial match.
701 // If we have full matches, return the longest one.
702 if (highWaterLength != 0) {
703 offset += forward ? highWaterLength : -highWaterLength;
704 return U_MATCH;
705 }
706 }
707 return UnicodeFilter::matches(text, offset, limit, incremental);
708 }
709 }
710
711 /**
712 * Returns the longest match for s in text at the given position.
713 * If limit > start then match forward from start+1 to limit
714 * matching all characters except s.charAt(0). If limit < start,
715 * go backward starting from start-1 matching all characters
716 * except s.charAt(s.length()-1). This method assumes that the
717 * first character, text.charAt(start), matches s, so it does not
718 * check it.
719 * @param text the text to match
720 * @param start the first character to match. In the forward
721 * direction, text.charAt(start) is matched against s.charAt(0).
722 * In the reverse direction, it is matched against
723 * s.charAt(s.length()-1).
724 * @param limit the limit offset for matching, either last+1 in
725 * the forward direction, or last-1 in the reverse direction,
726 * where last is the index of the last character to match.
727 * @return If part of s matches up to the limit, return |limit -
728 * start|. If all of s matches before reaching the limit, return
729 * s.length(). If there is a mismatch between s and text, return
730 * 0
731 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)732 int32_t UnicodeSet::matchRest(const Replaceable& text,
733 int32_t start, int32_t limit,
734 const UnicodeString& s) {
735 int32_t i;
736 int32_t maxLen;
737 int32_t slen = s.length();
738 if (start < limit) {
739 maxLen = limit - start;
740 if (maxLen > slen) maxLen = slen;
741 for (i = 1; i < maxLen; ++i) {
742 if (text.charAt(start + i) != s.charAt(i)) return 0;
743 }
744 } else {
745 maxLen = start - limit;
746 if (maxLen > slen) maxLen = slen;
747 --slen; // <=> slen = s.length() - 1;
748 for (i = 1; i < maxLen; ++i) {
749 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
750 }
751 }
752 return maxLen;
753 }
754
755 /**
756 * Implement of UnicodeMatcher
757 */
addMatchSetTo(UnicodeSet & toUnionTo) const758 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
759 toUnionTo.addAll(*this);
760 }
761
762 /**
763 * Returns the index of the given character within this set, where
764 * the set is ordered by ascending code point. If the character
765 * is not in this set, return -1. The inverse of this method is
766 * <code>charAt()</code>.
767 * @return an index from 0..size()-1, or -1
768 */
indexOf(UChar32 c) const769 int32_t UnicodeSet::indexOf(UChar32 c) const {
770 if (c < MIN_VALUE || c > MAX_VALUE) {
771 return -1;
772 }
773 int32_t i = 0;
774 int32_t n = 0;
775 for (;;) {
776 UChar32 start = list[i++];
777 if (c < start) {
778 return -1;
779 }
780 UChar32 limit = list[i++];
781 if (c < limit) {
782 return n + c - start;
783 }
784 n += limit - start;
785 }
786 }
787
788 /**
789 * Returns the character at the given index within this set, where
790 * the set is ordered by ascending code point. If the index is
791 * out of range, return (UChar32)-1. The inverse of this method is
792 * <code>indexOf()</code>.
793 * @param index an index from 0..size()-1
794 * @return the character at the given index, or (UChar32)-1.
795 */
charAt(int32_t index) const796 UChar32 UnicodeSet::charAt(int32_t index) const {
797 if (index >= 0) {
798 // len2 is the largest even integer <= len, that is, it is len
799 // for even values and len-1 for odd values. With odd values
800 // the last entry is UNICODESET_HIGH.
801 int32_t len2 = len & ~1;
802 for (int32_t i=0; i < len2;) {
803 UChar32 start = list[i++];
804 int32_t count = list[i++] - start;
805 if (index < count) {
806 return (UChar32)(start + index);
807 }
808 index -= count;
809 }
810 }
811 return (UChar32)-1;
812 }
813
814 /**
815 * Make this object represent the range <code>start - end</code>.
816 * If <code>end > start</code> then this object is set to an
817 * an empty range.
818 *
819 * @param start first character in the set, inclusive
820 * @rparam end last character in the set, inclusive
821 */
set(UChar32 start,UChar32 end)822 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
823 clear();
824 complement(start, end);
825 return *this;
826 }
827
828 /**
829 * Adds the specified range to this set if it is not already
830 * present. If this set already contains the specified range,
831 * the call leaves this set unchanged. If <code>end > start</code>
832 * then an empty range is added, leaving the set unchanged.
833 *
834 * @param start first character, inclusive, of range to be added
835 * to this set.
836 * @param end last character, inclusive, of range to be added
837 * to this set.
838 */
add(UChar32 start,UChar32 end)839 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
840 if (pinCodePoint(start) < pinCodePoint(end)) {
841 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
842 add(range, 2, 0);
843 } else if (start == end) {
844 add(start);
845 }
846 return *this;
847 }
848
849 // #define DEBUG_US_ADD
850
851 #ifdef DEBUG_US_ADD
852 #include <stdio.h>
dump(UChar32 c)853 void dump(UChar32 c) {
854 if (c <= 0xFF) {
855 printf("%c", (char)c);
856 } else {
857 printf("U+%04X", c);
858 }
859 }
dump(const UChar32 * list,int32_t len)860 void dump(const UChar32* list, int32_t len) {
861 printf("[");
862 for (int32_t i=0; i<len; ++i) {
863 if (i != 0) printf(", ");
864 dump(list[i]);
865 }
866 printf("]");
867 }
868 #endif
869
870 /**
871 * Adds the specified character to this set if it is not already
872 * present. If this set already contains the specified character,
873 * the call leaves this set unchanged.
874 */
add(UChar32 c)875 UnicodeSet& UnicodeSet::add(UChar32 c) {
876 // find smallest i such that c < list[i]
877 // if odd, then it is IN the set
878 // if even, then it is OUT of the set
879 int32_t i = findCodePoint(pinCodePoint(c));
880
881 // already in set?
882 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
883
884 // HIGH is 0x110000
885 // assert(list[len-1] == HIGH);
886
887 // empty = [HIGH]
888 // [start_0, limit_0, start_1, limit_1, HIGH]
889
890 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
891 // ^
892 // list[i]
893
894 // i == 0 means c is before the first range
895
896 #ifdef DEBUG_US_ADD
897 printf("Add of ");
898 dump(c);
899 printf(" found at %d", i);
900 printf(": ");
901 dump(list, len);
902 printf(" => ");
903 #endif
904
905 if (c == list[i]-1) {
906 // c is before start of next range
907 list[i] = c;
908 // if we touched the HIGH mark, then add a new one
909 if (c == (UNICODESET_HIGH - 1)) {
910 UErrorCode status = U_ZERO_ERROR;
911 ensureCapacity(len+1, status);
912 if (U_FAILURE(status)) {
913 return *this; // There is no way to report this error :-(
914 }
915 list[len++] = UNICODESET_HIGH;
916 }
917 if (i > 0 && c == list[i-1]) {
918 // collapse adjacent ranges
919
920 // [..., start_k-1, c, c, limit_k, ..., HIGH]
921 // ^
922 // list[i]
923
924 //for (int32_t k=i-1; k<len-2; ++k) {
925 // list[k] = list[k+2];
926 //}
927 UChar32* dst = list + i - 1;
928 UChar32* src = dst + 2;
929 UChar32* srclimit = list + len;
930 while (src < srclimit) *(dst++) = *(src++);
931
932 len -= 2;
933 }
934 }
935
936 else if (i > 0 && c == list[i-1]) {
937 // c is after end of prior range
938 list[i-1]++;
939 // no need to check for collapse here
940 }
941
942 else {
943 // At this point we know the new char is not adjacent to
944 // any existing ranges, and it is not 10FFFF.
945
946
947 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
948 // ^
949 // list[i]
950
951 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
952 // ^
953 // list[i]
954
955 UErrorCode status = U_ZERO_ERROR;
956 ensureCapacity(len+2, status);
957 if (U_FAILURE(status)) {
958 return *this; // There is no way to report this error :-(
959 }
960
961 //for (int32_t k=len-1; k>=i; --k) {
962 // list[k+2] = list[k];
963 //}
964 UChar32* src = list + len;
965 UChar32* dst = src + 2;
966 UChar32* srclimit = list + i;
967 while (src > srclimit) *(--dst) = *(--src);
968
969 list[i] = c;
970 list[i+1] = c+1;
971 len += 2;
972 }
973
974 #ifdef DEBUG_US_ADD
975 dump(list, len);
976 printf("\n");
977
978 for (i=1; i<len; ++i) {
979 if (list[i] <= list[i-1]) {
980 // Corrupt array!
981 printf("ERROR: list has been corrupted\n");
982 exit(1);
983 }
984 }
985 #endif
986
987 releasePattern();
988 return *this;
989 }
990
991 /**
992 * Adds the specified multicharacter to this set if it is not already
993 * present. If this set already contains the multicharacter,
994 * the call leaves this set unchanged.
995 * Thus "ch" => {"ch"}
996 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
997 * @param s the source string
998 * @return the modified set, for chaining
999 */
add(const UnicodeString & s)1000 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
1001 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1002 int32_t cp = getSingleCP(s);
1003 if (cp < 0) {
1004 if (!strings->contains((void*) &s)) {
1005 _add(s);
1006 releasePattern();
1007 }
1008 } else {
1009 add((UChar32)cp);
1010 }
1011 return *this;
1012 }
1013
1014 /**
1015 * Adds the given string, in order, to 'strings'. The given string
1016 * must have been checked by the caller to not be empty and to not
1017 * already be in 'strings'.
1018 */
_add(const UnicodeString & s)1019 void UnicodeSet::_add(const UnicodeString& s) {
1020 if (isFrozen() || isBogus()) {
1021 return;
1022 }
1023 UnicodeString* t = new UnicodeString(s);
1024 if (t == NULL) { // Check for memory allocation error.
1025 setToBogus();
1026 return;
1027 }
1028 UErrorCode ec = U_ZERO_ERROR;
1029 strings->sortedInsert(t, compareUnicodeString, ec);
1030 if (U_FAILURE(ec)) {
1031 setToBogus();
1032 delete t;
1033 }
1034 }
1035
1036 /**
1037 * @return a code point IF the string consists of a single one.
1038 * otherwise returns -1.
1039 * @param string to test
1040 */
getSingleCP(const UnicodeString & s)1041 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1042 //if (s.length() < 1) {
1043 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1044 //}
1045 if (s.length() > 2) return -1;
1046 if (s.length() == 1) return s.charAt(0);
1047
1048 // at this point, len = 2
1049 UChar32 cp = s.char32At(0);
1050 if (cp > 0xFFFF) { // is surrogate pair
1051 return cp;
1052 }
1053 return -1;
1054 }
1055
1056 /**
1057 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1058 * If this set already any particular character, it has no effect on that character.
1059 * @param the source string
1060 * @return the modified set, for chaining
1061 */
addAll(const UnicodeString & s)1062 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1063 UChar32 cp;
1064 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1065 cp = s.char32At(i);
1066 add(cp);
1067 }
1068 return *this;
1069 }
1070
1071 /**
1072 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1073 * If this set already any particular character, it has no effect on that character.
1074 * @param the source string
1075 * @return the modified set, for chaining
1076 */
retainAll(const UnicodeString & s)1077 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1078 UnicodeSet set;
1079 set.addAll(s);
1080 retainAll(set);
1081 return *this;
1082 }
1083
1084 /**
1085 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1086 * If this set already any particular character, it has no effect on that character.
1087 * @param the source string
1088 * @return the modified set, for chaining
1089 */
complementAll(const UnicodeString & s)1090 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1091 UnicodeSet set;
1092 set.addAll(s);
1093 complementAll(set);
1094 return *this;
1095 }
1096
1097 /**
1098 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1099 * If this set already any particular character, it has no effect on that character.
1100 * @param the source string
1101 * @return the modified set, for chaining
1102 */
removeAll(const UnicodeString & s)1103 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1104 UnicodeSet set;
1105 set.addAll(s);
1106 removeAll(set);
1107 return *this;
1108 }
1109
removeAllStrings()1110 UnicodeSet& UnicodeSet::removeAllStrings() {
1111 strings->removeAllElements();
1112 return *this;
1113 }
1114
1115
1116 /**
1117 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1118 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1119 * @param the source string
1120 * @return a newly created set containing the given string
1121 */
createFrom(const UnicodeString & s)1122 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1123 UnicodeSet *set = new UnicodeSet();
1124 if (set != NULL) { // Check for memory allocation error.
1125 set->add(s);
1126 }
1127 return set;
1128 }
1129
1130
1131 /**
1132 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1133 * @param the source string
1134 * @return a newly created set containing the given characters
1135 */
createFromAll(const UnicodeString & s)1136 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1137 UnicodeSet *set = new UnicodeSet();
1138 if (set != NULL) { // Check for memory allocation error.
1139 set->addAll(s);
1140 }
1141 return set;
1142 }
1143
1144 /**
1145 * Retain only the elements in this set that are contained in the
1146 * specified range. If <code>end > start</code> then an empty range is
1147 * retained, leaving the set empty.
1148 *
1149 * @param start first character, inclusive, of range to be retained
1150 * to this set.
1151 * @param end last character, inclusive, of range to be retained
1152 * to this set.
1153 */
retain(UChar32 start,UChar32 end)1154 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1155 if (pinCodePoint(start) <= pinCodePoint(end)) {
1156 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1157 retain(range, 2, 0);
1158 } else {
1159 clear();
1160 }
1161 return *this;
1162 }
1163
retain(UChar32 c)1164 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1165 return retain(c, c);
1166 }
1167
1168 /**
1169 * Removes the specified range from this set if it is present.
1170 * The set will not contain the specified range once the call
1171 * returns. If <code>end > start</code> then an empty range is
1172 * removed, leaving the set unchanged.
1173 *
1174 * @param start first character, inclusive, of range to be removed
1175 * from this set.
1176 * @param end last character, inclusive, of range to be removed
1177 * from this set.
1178 */
remove(UChar32 start,UChar32 end)1179 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1180 if (pinCodePoint(start) <= pinCodePoint(end)) {
1181 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1182 retain(range, 2, 2);
1183 }
1184 return *this;
1185 }
1186
1187 /**
1188 * Removes the specified character from this set if it is present.
1189 * The set will not contain the specified range once the call
1190 * returns.
1191 */
remove(UChar32 c)1192 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1193 return remove(c, c);
1194 }
1195
1196 /**
1197 * Removes the specified string from this set if it is present.
1198 * The set will not contain the specified character once the call
1199 * returns.
1200 * @param the source string
1201 * @return the modified set, for chaining
1202 */
remove(const UnicodeString & s)1203 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1204 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1205 int32_t cp = getSingleCP(s);
1206 if (cp < 0) {
1207 strings->removeElement((void*) &s);
1208 releasePattern();
1209 } else {
1210 remove((UChar32)cp, (UChar32)cp);
1211 }
1212 return *this;
1213 }
1214
1215 /**
1216 * Complements the specified range in this set. Any character in
1217 * the range will be removed if it is in this set, or will be
1218 * added if it is not in this set. If <code>end > start</code>
1219 * then an empty range is xor'ed, leaving the set unchanged.
1220 *
1221 * @param start first character, inclusive, of range to be removed
1222 * from this set.
1223 * @param end last character, inclusive, of range to be removed
1224 * from this set.
1225 */
complement(UChar32 start,UChar32 end)1226 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1227 if (isFrozen() || isBogus()) {
1228 return *this;
1229 }
1230 if (pinCodePoint(start) <= pinCodePoint(end)) {
1231 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1232 exclusiveOr(range, 2, 0);
1233 }
1234 releasePattern();
1235 return *this;
1236 }
1237
complement(UChar32 c)1238 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1239 return complement(c, c);
1240 }
1241
1242 /**
1243 * This is equivalent to
1244 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1245 */
complement(void)1246 UnicodeSet& UnicodeSet::complement(void) {
1247 if (isFrozen() || isBogus()) {
1248 return *this;
1249 }
1250 UErrorCode status = U_ZERO_ERROR;
1251 if (list[0] == UNICODESET_LOW) {
1252 ensureBufferCapacity(len-1, status);
1253 if (U_FAILURE(status)) {
1254 return *this;
1255 }
1256 uprv_memcpy(buffer, list + 1, (len-1)*sizeof(UChar32));
1257 --len;
1258 } else {
1259 ensureBufferCapacity(len+1, status);
1260 if (U_FAILURE(status)) {
1261 return *this;
1262 }
1263 uprv_memcpy(buffer + 1, list, len*sizeof(UChar32));
1264 buffer[0] = UNICODESET_LOW;
1265 ++len;
1266 }
1267 swapBuffers();
1268 releasePattern();
1269 return *this;
1270 }
1271
1272 /**
1273 * Complement the specified string in this set.
1274 * The set will not contain the specified string once the call
1275 * returns.
1276 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1277 * @param s the string to complement
1278 * @return this object, for chaining
1279 */
complement(const UnicodeString & s)1280 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1281 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1282 int32_t cp = getSingleCP(s);
1283 if (cp < 0) {
1284 if (strings->contains((void*) &s)) {
1285 strings->removeElement((void*) &s);
1286 } else {
1287 _add(s);
1288 }
1289 releasePattern();
1290 } else {
1291 complement((UChar32)cp, (UChar32)cp);
1292 }
1293 return *this;
1294 }
1295
1296 /**
1297 * Adds all of the elements in the specified set to this set if
1298 * they're not already present. This operation effectively
1299 * modifies this set so that its value is the <i>union</i> of the two
1300 * sets. The behavior of this operation is unspecified if the specified
1301 * collection is modified while the operation is in progress.
1302 *
1303 * @param c set whose elements are to be added to this set.
1304 * @see #add(char, char)
1305 */
addAll(const UnicodeSet & c)1306 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1307 if ( c.len>0 && c.list!=NULL ) {
1308 add(c.list, c.len, 0);
1309 }
1310
1311 // Add strings in order
1312 if ( c.strings!=NULL ) {
1313 for (int32_t i=0; i<c.strings->size(); ++i) {
1314 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1315 if (!strings->contains((void*) s)) {
1316 _add(*s);
1317 }
1318 }
1319 }
1320 return *this;
1321 }
1322
1323 /**
1324 * Retains only the elements in this set that are contained in the
1325 * specified set. In other words, removes from this set all of
1326 * its elements that are not contained in the specified set. This
1327 * operation effectively modifies this set so that its value is
1328 * the <i>intersection</i> of the two sets.
1329 *
1330 * @param c set that defines which elements this set will retain.
1331 */
retainAll(const UnicodeSet & c)1332 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1333 if (isFrozen() || isBogus()) {
1334 return *this;
1335 }
1336 retain(c.list, c.len, 0);
1337 strings->retainAll(*c.strings);
1338 return *this;
1339 }
1340
1341 /**
1342 * Removes from this set all of its elements that are contained in the
1343 * specified set. This operation effectively modifies this
1344 * set so that its value is the <i>asymmetric set difference</i> of
1345 * the two sets.
1346 *
1347 * @param c set that defines which elements will be removed from
1348 * this set.
1349 */
removeAll(const UnicodeSet & c)1350 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1351 if (isFrozen() || isBogus()) {
1352 return *this;
1353 }
1354 retain(c.list, c.len, 2);
1355 strings->removeAll(*c.strings);
1356 return *this;
1357 }
1358
1359 /**
1360 * Complements in this set all elements contained in the specified
1361 * set. Any character in the other set will be removed if it is
1362 * in this set, or will be added if it is not in this set.
1363 *
1364 * @param c set that defines which elements will be xor'ed from
1365 * this set.
1366 */
complementAll(const UnicodeSet & c)1367 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1368 if (isFrozen() || isBogus()) {
1369 return *this;
1370 }
1371 exclusiveOr(c.list, c.len, 0);
1372
1373 for (int32_t i=0; i<c.strings->size(); ++i) {
1374 void* e = c.strings->elementAt(i);
1375 if (!strings->removeElement(e)) {
1376 _add(*(const UnicodeString*)e);
1377 }
1378 }
1379 return *this;
1380 }
1381
1382 /**
1383 * Removes all of the elements from this set. This set will be
1384 * empty after this call returns.
1385 */
clear(void)1386 UnicodeSet& UnicodeSet::clear(void) {
1387 if (isFrozen()) {
1388 return *this;
1389 }
1390 if (list != NULL) {
1391 list[0] = UNICODESET_HIGH;
1392 }
1393 len = 1;
1394 releasePattern();
1395 if (strings != NULL) {
1396 strings->removeAllElements();
1397 }
1398 if (list != NULL && strings != NULL) {
1399 // Remove bogus
1400 fFlags = 0;
1401 }
1402 return *this;
1403 }
1404
1405 /**
1406 * Iteration method that returns the number of ranges contained in
1407 * this set.
1408 * @see #getRangeStart
1409 * @see #getRangeEnd
1410 */
getRangeCount() const1411 int32_t UnicodeSet::getRangeCount() const {
1412 return len/2;
1413 }
1414
1415 /**
1416 * Iteration method that returns the first character in the
1417 * specified range of this set.
1418 * @see #getRangeCount
1419 * @see #getRangeEnd
1420 */
getRangeStart(int32_t index) const1421 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1422 return list[index*2];
1423 }
1424
1425 /**
1426 * Iteration method that returns the last character in the
1427 * specified range of this set.
1428 * @see #getRangeStart
1429 * @see #getRangeEnd
1430 */
getRangeEnd(int32_t index) const1431 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1432 return list[index*2 + 1] - 1;
1433 }
1434
getStringCount() const1435 int32_t UnicodeSet::getStringCount() const {
1436 return strings->size();
1437 }
1438
getString(int32_t index) const1439 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1440 return (const UnicodeString*) strings->elementAt(index);
1441 }
1442
1443 /**
1444 * Reallocate this objects internal structures to take up the least
1445 * possible space, without changing this object's value.
1446 */
compact()1447 UnicodeSet& UnicodeSet::compact() {
1448 if (isFrozen() || isBogus()) {
1449 return *this;
1450 }
1451 // Delete buffer first to defragment memory less.
1452 if (buffer != NULL) {
1453 uprv_free(buffer);
1454 buffer = NULL;
1455 }
1456 if (len < capacity) {
1457 // Make the capacity equal to len or 1.
1458 // We don't want to realloc of 0 size.
1459 int32_t newCapacity = len + (len == 0);
1460 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity);
1461 if (temp) {
1462 list = temp;
1463 capacity = newCapacity;
1464 }
1465 // else what the heck happened?! We allocated less memory!
1466 // Oh well. We'll keep our original array.
1467 }
1468 return *this;
1469 }
1470
1471 #ifdef DEBUG_SERIALIZE
1472 #include <stdio.h>
1473 #endif
1474
1475 /**
1476 * Deserialize constructor.
1477 */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1478 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec)
1479 : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0),
1480 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
1481 fFlags(0) {
1482
1483 if(U_FAILURE(ec)) {
1484 setToBogus();
1485 return;
1486 }
1487
1488 if( (serialization != kSerialized)
1489 || (data==NULL)
1490 || (dataLen < 1)) {
1491 ec = U_ILLEGAL_ARGUMENT_ERROR;
1492 setToBogus();
1493 return;
1494 }
1495
1496 allocateStrings(ec);
1497 if (U_FAILURE(ec)) {
1498 setToBogus();
1499 return;
1500 }
1501
1502 // bmp?
1503 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1504 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1505
1506 len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1507 #ifdef DEBUG_SERIALIZE
1508 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]);
1509 #endif
1510 capacity = len+1;
1511 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
1512 if(!list || U_FAILURE(ec)) {
1513 setToBogus();
1514 return;
1515 }
1516 // copy bmp
1517 int32_t i;
1518 for(i = 0; i< bmpLength;i++) {
1519 list[i] = data[i+headerSize];
1520 #ifdef DEBUG_SERIALIZE
1521 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1522 #endif
1523 }
1524 // copy smp
1525 for(i=bmpLength;i<len;i++) {
1526 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1527 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1528 #ifdef DEBUG_SERIALIZE
1529 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1530 #endif
1531 }
1532 // terminator
1533 list[len++]=UNICODESET_HIGH;
1534 }
1535
1536
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1537 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1538 int32_t bmpLength, length, destLength;
1539
1540 if (U_FAILURE(ec)) {
1541 return 0;
1542 }
1543
1544 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1545 ec=U_ILLEGAL_ARGUMENT_ERROR;
1546 return 0;
1547 }
1548
1549 /* count necessary 16-bit units */
1550 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1551 // assert(length>=0);
1552 if (length==0) {
1553 /* empty set */
1554 if (destCapacity>0) {
1555 *dest=0;
1556 } else {
1557 ec=U_BUFFER_OVERFLOW_ERROR;
1558 }
1559 return 1;
1560 }
1561 /* now length>0 */
1562
1563 if (this->list[length-1]<=0xffff) {
1564 /* all BMP */
1565 bmpLength=length;
1566 } else if (this->list[0]>=0x10000) {
1567 /* all supplementary */
1568 bmpLength=0;
1569 length*=2;
1570 } else {
1571 /* some BMP, some supplementary */
1572 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1573 length=bmpLength+2*(length-bmpLength);
1574 }
1575 #ifdef DEBUG_SERIALIZE
1576 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1577 #endif
1578 /* length: number of 16-bit array units */
1579 if (length>0x7fff) {
1580 /* there are only 15 bits for the length in the first serialized word */
1581 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1582 return 0;
1583 }
1584
1585 /*
1586 * total serialized length:
1587 * number of 16-bit array units (length) +
1588 * 1 length unit (always) +
1589 * 1 bmpLength unit (if there are supplementary values)
1590 */
1591 destLength=length+((length>bmpLength)?2:1);
1592 if (destLength<=destCapacity) {
1593 const UChar32 *p;
1594 int32_t i;
1595
1596 #ifdef DEBUG_SERIALIZE
1597 printf("writeHdr\n");
1598 #endif
1599 *dest=(uint16_t)length;
1600 if (length>bmpLength) {
1601 *dest|=0x8000;
1602 *++dest=(uint16_t)bmpLength;
1603 }
1604 ++dest;
1605
1606 /* write the BMP part of the array */
1607 p=this->list;
1608 for (i=0; i<bmpLength; ++i) {
1609 #ifdef DEBUG_SERIALIZE
1610 printf("writebmp: %x\n", (int)*p);
1611 #endif
1612 *dest++=(uint16_t)*p++;
1613 }
1614
1615 /* write the supplementary part of the array */
1616 for (; i<length; i+=2) {
1617 #ifdef DEBUG_SERIALIZE
1618 printf("write32: %x\n", (int)*p);
1619 #endif
1620 *dest++=(uint16_t)(*p>>16);
1621 *dest++=(uint16_t)*p++;
1622 }
1623 } else {
1624 ec=U_BUFFER_OVERFLOW_ERROR;
1625 }
1626 return destLength;
1627 }
1628
1629 //----------------------------------------------------------------
1630 // Implementation: Utility methods
1631 //----------------------------------------------------------------
1632
1633 /**
1634 * Allocate our strings vector and return TRUE if successful.
1635 */
allocateStrings(UErrorCode & status)1636 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1637 if (U_FAILURE(status)) {
1638 return FALSE;
1639 }
1640 strings = new UVector(uprv_deleteUObject,
1641 uhash_compareUnicodeString, 1, status);
1642 if (strings == NULL) { // Check for memory allocation error.
1643 status = U_MEMORY_ALLOCATION_ERROR;
1644 return FALSE;
1645 }
1646 if (U_FAILURE(status)) {
1647 delete strings;
1648 strings = NULL;
1649 return FALSE;
1650 }
1651 return TRUE;
1652 }
1653
ensureCapacity(int32_t newLen,UErrorCode & ec)1654 void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) {
1655 if (newLen <= capacity)
1656 return;
1657 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA));
1658 if (temp == NULL) {
1659 ec = U_MEMORY_ALLOCATION_ERROR;
1660 setToBogus();
1661 return;
1662 }
1663 list = temp;
1664 capacity = newLen + GROW_EXTRA;
1665 // else we keep the original contents on the memory failure.
1666 }
1667
ensureBufferCapacity(int32_t newLen,UErrorCode & ec)1668 void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) {
1669 if (buffer != NULL && newLen <= bufferCapacity)
1670 return;
1671 UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA));
1672 if (temp == NULL) {
1673 ec = U_MEMORY_ALLOCATION_ERROR;
1674 setToBogus();
1675 return;
1676 }
1677 buffer = temp;
1678 bufferCapacity = newLen + GROW_EXTRA;
1679 // else we keep the original contents on the memory failure.
1680 }
1681
1682 /**
1683 * Swap list and buffer.
1684 */
swapBuffers(void)1685 void UnicodeSet::swapBuffers(void) {
1686 // swap list and buffer
1687 UChar32* temp = list;
1688 list = buffer;
1689 buffer = temp;
1690
1691 int32_t c = capacity;
1692 capacity = bufferCapacity;
1693 bufferCapacity = c;
1694 }
1695
setToBogus()1696 void UnicodeSet::setToBogus() {
1697 clear(); // Remove everything in the set.
1698 fFlags = kIsBogus;
1699 }
1700
1701 //----------------------------------------------------------------
1702 // Implementation: Fundamental operators
1703 //----------------------------------------------------------------
1704
max(UChar32 a,UChar32 b)1705 static inline UChar32 max(UChar32 a, UChar32 b) {
1706 return (a > b) ? a : b;
1707 }
1708
1709 // polarity = 0, 3 is normal: x xor y
1710 // polarity = 1, 2: x xor ~y == x === y
1711
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1712 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1713 if (isFrozen() || isBogus()) {
1714 return;
1715 }
1716 UErrorCode status = U_ZERO_ERROR;
1717 ensureBufferCapacity(len + otherLen, status);
1718 if (U_FAILURE(status)) {
1719 return;
1720 }
1721
1722 int32_t i = 0, j = 0, k = 0;
1723 UChar32 a = list[i++];
1724 UChar32 b;
1725 if (polarity == 1 || polarity == 2) {
1726 b = UNICODESET_LOW;
1727 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1728 ++j;
1729 b = other[j];
1730 }
1731 } else {
1732 b = other[j++];
1733 }
1734 // simplest of all the routines
1735 // sort the values, discarding identicals!
1736 for (;;) {
1737 if (a < b) {
1738 buffer[k++] = a;
1739 a = list[i++];
1740 } else if (b < a) {
1741 buffer[k++] = b;
1742 b = other[j++];
1743 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1744 // discard both values!
1745 a = list[i++];
1746 b = other[j++];
1747 } else { // DONE!
1748 buffer[k++] = UNICODESET_HIGH;
1749 len = k;
1750 break;
1751 }
1752 }
1753 swapBuffers();
1754 releasePattern();
1755 }
1756
1757 // polarity = 0 is normal: x union y
1758 // polarity = 2: x union ~y
1759 // polarity = 1: ~x union y
1760 // polarity = 3: ~x union ~y
1761
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1762 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1763 if (isFrozen() || isBogus() || other==NULL) {
1764 return;
1765 }
1766 UErrorCode status = U_ZERO_ERROR;
1767 ensureBufferCapacity(len + otherLen, status);
1768 if (U_FAILURE(status)) {
1769 return;
1770 }
1771
1772 int32_t i = 0, j = 0, k = 0;
1773 UChar32 a = list[i++];
1774 UChar32 b = other[j++];
1775 // change from xor is that we have to check overlapping pairs
1776 // polarity bit 1 means a is second, bit 2 means b is.
1777 for (;;) {
1778 switch (polarity) {
1779 case 0: // both first; take lower if unequal
1780 if (a < b) { // take a
1781 // Back up over overlapping ranges in buffer[]
1782 if (k > 0 && a <= buffer[k-1]) {
1783 // Pick latter end value in buffer[] vs. list[]
1784 a = max(list[i], buffer[--k]);
1785 } else {
1786 // No overlap
1787 buffer[k++] = a;
1788 a = list[i];
1789 }
1790 i++; // Common if/else code factored out
1791 polarity ^= 1;
1792 } else if (b < a) { // take b
1793 if (k > 0 && b <= buffer[k-1]) {
1794 b = max(other[j], buffer[--k]);
1795 } else {
1796 buffer[k++] = b;
1797 b = other[j];
1798 }
1799 j++;
1800 polarity ^= 2;
1801 } else { // a == b, take a, drop b
1802 if (a == UNICODESET_HIGH) goto loop_end;
1803 // This is symmetrical; it doesn't matter if
1804 // we backtrack with a or b. - liu
1805 if (k > 0 && a <= buffer[k-1]) {
1806 a = max(list[i], buffer[--k]);
1807 } else {
1808 // No overlap
1809 buffer[k++] = a;
1810 a = list[i];
1811 }
1812 i++;
1813 polarity ^= 1;
1814 b = other[j++];
1815 polarity ^= 2;
1816 }
1817 break;
1818 case 3: // both second; take higher if unequal, and drop other
1819 if (b <= a) { // take a
1820 if (a == UNICODESET_HIGH) goto loop_end;
1821 buffer[k++] = a;
1822 } else { // take b
1823 if (b == UNICODESET_HIGH) goto loop_end;
1824 buffer[k++] = b;
1825 }
1826 a = list[i++];
1827 polarity ^= 1; // factored common code
1828 b = other[j++];
1829 polarity ^= 2;
1830 break;
1831 case 1: // a second, b first; if b < a, overlap
1832 if (a < b) { // no overlap, take a
1833 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1834 } else if (b < a) { // OVERLAP, drop b
1835 b = other[j++];
1836 polarity ^= 2;
1837 } else { // a == b, drop both!
1838 if (a == UNICODESET_HIGH) goto loop_end;
1839 a = list[i++];
1840 polarity ^= 1;
1841 b = other[j++];
1842 polarity ^= 2;
1843 }
1844 break;
1845 case 2: // a first, b second; if a < b, overlap
1846 if (b < a) { // no overlap, take b
1847 buffer[k++] = b;
1848 b = other[j++];
1849 polarity ^= 2;
1850 } else if (a < b) { // OVERLAP, drop a
1851 a = list[i++];
1852 polarity ^= 1;
1853 } else { // a == b, drop both!
1854 if (a == UNICODESET_HIGH) goto loop_end;
1855 a = list[i++];
1856 polarity ^= 1;
1857 b = other[j++];
1858 polarity ^= 2;
1859 }
1860 break;
1861 }
1862 }
1863 loop_end:
1864 buffer[k++] = UNICODESET_HIGH; // terminate
1865 len = k;
1866 swapBuffers();
1867 releasePattern();
1868 }
1869
1870 // polarity = 0 is normal: x intersect y
1871 // polarity = 2: x intersect ~y == set-minus
1872 // polarity = 1: ~x intersect y
1873 // polarity = 3: ~x intersect ~y
1874
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1875 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1876 if (isFrozen() || isBogus()) {
1877 return;
1878 }
1879 UErrorCode status = U_ZERO_ERROR;
1880 ensureBufferCapacity(len + otherLen, status);
1881 if (U_FAILURE(status)) {
1882 return;
1883 }
1884
1885 int32_t i = 0, j = 0, k = 0;
1886 UChar32 a = list[i++];
1887 UChar32 b = other[j++];
1888 // change from xor is that we have to check overlapping pairs
1889 // polarity bit 1 means a is second, bit 2 means b is.
1890 for (;;) {
1891 switch (polarity) {
1892 case 0: // both first; drop the smaller
1893 if (a < b) { // drop a
1894 a = list[i++];
1895 polarity ^= 1;
1896 } else if (b < a) { // drop b
1897 b = other[j++];
1898 polarity ^= 2;
1899 } else { // a == b, take one, drop other
1900 if (a == UNICODESET_HIGH) goto loop_end;
1901 buffer[k++] = a;
1902 a = list[i++];
1903 polarity ^= 1;
1904 b = other[j++];
1905 polarity ^= 2;
1906 }
1907 break;
1908 case 3: // both second; take lower if unequal
1909 if (a < b) { // take a
1910 buffer[k++] = a;
1911 a = list[i++];
1912 polarity ^= 1;
1913 } else if (b < a) { // take b
1914 buffer[k++] = b;
1915 b = other[j++];
1916 polarity ^= 2;
1917 } else { // a == b, take one, drop other
1918 if (a == UNICODESET_HIGH) goto loop_end;
1919 buffer[k++] = a;
1920 a = list[i++];
1921 polarity ^= 1;
1922 b = other[j++];
1923 polarity ^= 2;
1924 }
1925 break;
1926 case 1: // a second, b first;
1927 if (a < b) { // NO OVERLAP, drop a
1928 a = list[i++];
1929 polarity ^= 1;
1930 } else if (b < a) { // OVERLAP, take b
1931 buffer[k++] = b;
1932 b = other[j++];
1933 polarity ^= 2;
1934 } else { // a == b, drop both!
1935 if (a == UNICODESET_HIGH) goto loop_end;
1936 a = list[i++];
1937 polarity ^= 1;
1938 b = other[j++];
1939 polarity ^= 2;
1940 }
1941 break;
1942 case 2: // a first, b second; if a < b, overlap
1943 if (b < a) { // no overlap, drop b
1944 b = other[j++];
1945 polarity ^= 2;
1946 } else if (a < b) { // OVERLAP, take a
1947 buffer[k++] = a;
1948 a = list[i++];
1949 polarity ^= 1;
1950 } else { // a == b, drop both!
1951 if (a == UNICODESET_HIGH) goto loop_end;
1952 a = list[i++];
1953 polarity ^= 1;
1954 b = other[j++];
1955 polarity ^= 2;
1956 }
1957 break;
1958 }
1959 }
1960 loop_end:
1961 buffer[k++] = UNICODESET_HIGH; // terminate
1962 len = k;
1963 swapBuffers();
1964 releasePattern();
1965 }
1966
1967 /**
1968 * Append the <code>toPattern()</code> representation of a
1969 * string to the given <code>StringBuffer</code>.
1970 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1971 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1972 escapeUnprintable) {
1973 UChar32 cp;
1974 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1975 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1976 }
1977 }
1978
1979 /**
1980 * Append the <code>toPattern()</code> representation of a
1981 * character to the given <code>StringBuffer</code>.
1982 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1983 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1984 escapeUnprintable) {
1985 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1986 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1987 // unprintable
1988 if (ICU_Utility::escapeUnprintable(buf, c)) {
1989 return;
1990 }
1991 }
1992 // Okay to let ':' pass through
1993 switch (c) {
1994 case SET_OPEN:
1995 case SET_CLOSE:
1996 case HYPHEN:
1997 case COMPLEMENT:
1998 case INTERSECTION:
1999 case BACKSLASH:
2000 case OPEN_BRACE:
2001 case CLOSE_BRACE:
2002 case COLON:
2003 case SymbolTable::SYMBOL_REF:
2004 buf.append(BACKSLASH);
2005 break;
2006 default:
2007 // Escape whitespace
2008 if (PatternProps::isWhiteSpace(c)) {
2009 buf.append(BACKSLASH);
2010 }
2011 break;
2012 }
2013 buf.append(c);
2014 }
2015
2016 /**
2017 * Append a string representation of this set to result. This will be
2018 * a cleaned version of the string passed to applyPattern(), if there
2019 * is one. Otherwise it will be generated.
2020 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2021 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2022 UBool escapeUnprintable) const
2023 {
2024 if (pat != NULL) {
2025 int32_t i;
2026 int32_t backslashCount = 0;
2027 for (i=0; i<patLen; ) {
2028 UChar32 c;
2029 U16_NEXT(pat, i, patLen, c);
2030 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2031 // If the unprintable character is preceded by an odd
2032 // number of backslashes, then it has been escaped.
2033 // Before unescaping it, we delete the final
2034 // backslash.
2035 if ((backslashCount % 2) == 1) {
2036 result.truncate(result.length() - 1);
2037 }
2038 ICU_Utility::escapeUnprintable(result, c);
2039 backslashCount = 0;
2040 } else {
2041 result.append(c);
2042 if (c == BACKSLASH) {
2043 ++backslashCount;
2044 } else {
2045 backslashCount = 0;
2046 }
2047 }
2048 }
2049 return result;
2050 }
2051
2052 return _generatePattern(result, escapeUnprintable);
2053 }
2054
2055 /**
2056 * Returns a string representation of this set. If the result of
2057 * calling this function is passed to a UnicodeSet constructor, it
2058 * will produce another set that is equal to this one.
2059 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2060 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2061 UBool escapeUnprintable) const
2062 {
2063 result.truncate(0);
2064 return _toPattern(result, escapeUnprintable);
2065 }
2066
2067 /**
2068 * Generate and append a string representation of this set to result.
2069 * This does not use this.pat, the cleaned up copy of the string
2070 * passed to applyPattern().
2071 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2072 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2073 UBool escapeUnprintable) const
2074 {
2075 result.append(SET_OPEN);
2076
2077 // // Check against the predefined categories. We implicitly build
2078 // // up ALL category sets the first time toPattern() is called.
2079 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2080 // if (*this == getCategorySet(cat)) {
2081 // result.append(COLON);
2082 // result.append(CATEGORY_NAMES, cat*2, 2);
2083 // return result.append(CATEGORY_CLOSE);
2084 // }
2085 // }
2086
2087 int32_t count = getRangeCount();
2088
2089 // If the set contains at least 2 intervals and includes both
2090 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2091 // be more economical.
2092 if (count > 1 &&
2093 getRangeStart(0) == MIN_VALUE &&
2094 getRangeEnd(count-1) == MAX_VALUE) {
2095
2096 // Emit the inverse
2097 result.append(COMPLEMENT);
2098
2099 for (int32_t i = 1; i < count; ++i) {
2100 UChar32 start = getRangeEnd(i-1)+1;
2101 UChar32 end = getRangeStart(i)-1;
2102 _appendToPat(result, start, escapeUnprintable);
2103 if (start != end) {
2104 if ((start+1) != end) {
2105 result.append(HYPHEN);
2106 }
2107 _appendToPat(result, end, escapeUnprintable);
2108 }
2109 }
2110 }
2111
2112 // Default; emit the ranges as pairs
2113 else {
2114 for (int32_t i = 0; i < count; ++i) {
2115 UChar32 start = getRangeStart(i);
2116 UChar32 end = getRangeEnd(i);
2117 _appendToPat(result, start, escapeUnprintable);
2118 if (start != end) {
2119 if ((start+1) != end) {
2120 result.append(HYPHEN);
2121 }
2122 _appendToPat(result, end, escapeUnprintable);
2123 }
2124 }
2125 }
2126
2127 for (int32_t i = 0; i<strings->size(); ++i) {
2128 result.append(OPEN_BRACE);
2129 _appendToPat(result,
2130 *(const UnicodeString*) strings->elementAt(i),
2131 escapeUnprintable);
2132 result.append(CLOSE_BRACE);
2133 }
2134 return result.append(SET_CLOSE);
2135 }
2136
2137 /**
2138 * Release existing cached pattern
2139 */
releasePattern()2140 void UnicodeSet::releasePattern() {
2141 if (pat) {
2142 uprv_free(pat);
2143 pat = NULL;
2144 patLen = 0;
2145 }
2146 }
2147
2148 /**
2149 * Set the new pattern to cache.
2150 */
setPattern(const UnicodeString & newPat)2151 void UnicodeSet::setPattern(const UnicodeString& newPat) {
2152 releasePattern();
2153 int32_t newPatLen = newPat.length();
2154 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2155 if (pat) {
2156 patLen = newPatLen;
2157 newPat.extractBetween(0, patLen, pat);
2158 pat[patLen] = 0;
2159 }
2160 // else we don't care if malloc failed. This was just a nice cache.
2161 // We can regenerate an equivalent pattern later when requested.
2162 }
2163
freeze()2164 UnicodeFunctor *UnicodeSet::freeze() {
2165 if(!isFrozen() && !isBogus()) {
2166 // Do most of what compact() does before freezing because
2167 // compact() will not work when the set is frozen.
2168 // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA).
2169
2170 // Delete buffer first to defragment memory less.
2171 if (buffer != NULL) {
2172 uprv_free(buffer);
2173 buffer = NULL;
2174 }
2175 if (capacity > (len + GROW_EXTRA)) {
2176 // Make the capacity equal to len or 1.
2177 // We don't want to realloc of 0 size.
2178 capacity = len + (len == 0);
2179 list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity);
2180 if (list == NULL) { // Check for memory allocation error.
2181 setToBogus();
2182 return this;
2183 }
2184 }
2185
2186 // Optimize contains() and span() and similar functions.
2187 if (!strings->isEmpty()) {
2188 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2189 if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) {
2190 // All strings are irrelevant for span() etc. because
2191 // all of each string's code points are contained in this set.
2192 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2193 // many relevant strings as UTF-16.
2194 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2195 delete stringSpan;
2196 stringSpan = NULL;
2197 }
2198 }
2199 if (stringSpan == NULL) {
2200 // No span-relevant strings: Optimize for code point spans.
2201 bmpSet=new BMPSet(list, len);
2202 if (bmpSet == NULL) { // Check for memory allocation error.
2203 setToBogus();
2204 }
2205 }
2206 }
2207 return this;
2208 }
2209
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2210 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2211 if(length>0 && bmpSet!=NULL) {
2212 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2213 }
2214 if(length<0) {
2215 length=u_strlen(s);
2216 }
2217 if(length==0) {
2218 return 0;
2219 }
2220 if(stringSpan!=NULL) {
2221 return stringSpan->span(s, length, spanCondition);
2222 } else if(!strings->isEmpty()) {
2223 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2224 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2225 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2226 UnicodeSetStringSpan strSpan(*this, *strings, which);
2227 if(strSpan.needsStringSpanUTF16()) {
2228 return strSpan.span(s, length, spanCondition);
2229 }
2230 }
2231
2232 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2233 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2234 }
2235
2236 UChar32 c;
2237 int32_t start=0, prev=0;
2238 do {
2239 U16_NEXT(s, start, length, c);
2240 if(spanCondition!=contains(c)) {
2241 break;
2242 }
2243 } while((prev=start)<length);
2244 return prev;
2245 }
2246
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2247 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2248 if(length>0 && bmpSet!=NULL) {
2249 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2250 }
2251 if(length<0) {
2252 length=u_strlen(s);
2253 }
2254 if(length==0) {
2255 return 0;
2256 }
2257 if(stringSpan!=NULL) {
2258 return stringSpan->spanBack(s, length, spanCondition);
2259 } else if(!strings->isEmpty()) {
2260 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2261 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2262 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2263 UnicodeSetStringSpan strSpan(*this, *strings, which);
2264 if(strSpan.needsStringSpanUTF16()) {
2265 return strSpan.spanBack(s, length, spanCondition);
2266 }
2267 }
2268
2269 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2270 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2271 }
2272
2273 UChar32 c;
2274 int32_t prev=length;
2275 do {
2276 U16_PREV(s, 0, length, c);
2277 if(spanCondition!=contains(c)) {
2278 break;
2279 }
2280 } while((prev=length)>0);
2281 return prev;
2282 }
2283
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2284 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2285 if(length>0 && bmpSet!=NULL) {
2286 const uint8_t *s0=(const uint8_t *)s;
2287 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2288 }
2289 if(length<0) {
2290 length=(int32_t)uprv_strlen(s);
2291 }
2292 if(length==0) {
2293 return 0;
2294 }
2295 if(stringSpan!=NULL) {
2296 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2297 } else if(!strings->isEmpty()) {
2298 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2299 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2300 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2301 UnicodeSetStringSpan strSpan(*this, *strings, which);
2302 if(strSpan.needsStringSpanUTF8()) {
2303 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2304 }
2305 }
2306
2307 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2308 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2309 }
2310
2311 UChar32 c;
2312 int32_t start=0, prev=0;
2313 do {
2314 U8_NEXT_OR_FFFD(s, start, length, c);
2315 if(spanCondition!=contains(c)) {
2316 break;
2317 }
2318 } while((prev=start)<length);
2319 return prev;
2320 }
2321
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2322 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2323 if(length>0 && bmpSet!=NULL) {
2324 const uint8_t *s0=(const uint8_t *)s;
2325 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2326 }
2327 if(length<0) {
2328 length=(int32_t)uprv_strlen(s);
2329 }
2330 if(length==0) {
2331 return 0;
2332 }
2333 if(stringSpan!=NULL) {
2334 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2335 } else if(!strings->isEmpty()) {
2336 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2337 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2338 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2339 UnicodeSetStringSpan strSpan(*this, *strings, which);
2340 if(strSpan.needsStringSpanUTF8()) {
2341 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2342 }
2343 }
2344
2345 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2346 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2347 }
2348
2349 UChar32 c;
2350 int32_t prev=length;
2351 do {
2352 U8_PREV_OR_FFFD(s, 0, length, c);
2353 if(spanCondition!=contains(c)) {
2354 break;
2355 }
2356 } while((prev=length)>0);
2357 return prev;
2358 }
2359
2360 U_NAMESPACE_END
2361