1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32
33 // HIGH_VALUE > all valid values. 110000 for codepoints
34 #define UNICODESET_HIGH 0x0110000
35
36 // LOW <= all valid values. ZERO for codepoints
37 #define UNICODESET_LOW 0x000000
38
39 /** Max list [0, 1, 2, ..., max code point, HIGH] */
40 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
41
42 U_NAMESPACE_BEGIN
43
~SymbolTable()44 SymbolTable::~SymbolTable() {}
45
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
47
48 /**
49 * Modify the given UChar32 variable so that it is in range, by
50 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
51 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
52 * It modifies its argument in-place and also returns it.
53 */
54 static inline UChar32 pinCodePoint(UChar32& c) {
55 if (c < UNICODESET_LOW) {
56 c = UNICODESET_LOW;
57 } else if (c > (UNICODESET_HIGH-1)) {
58 c = (UNICODESET_HIGH-1);
59 }
60 return c;
61 }
62
63 //----------------------------------------------------------------
64 // Debugging
65 //----------------------------------------------------------------
66
67 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
68 // To enable the debugging, define the symbol DEBUG_MEM in the line
69 // below. This will result in text being sent to stdout that looks
70 // like this:
71 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
72 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
73 // Each line lists a construction (ct) or destruction (dt) event, the
74 // object address, the number of outstanding objects after the event,
75 // and the pattern of the object in question.
76
77 // #define DEBUG_MEM
78
79 #ifdef DEBUG_MEM
80 #include <stdio.h>
81 static int32_t _dbgCount = 0;
82
_dbgct(UnicodeSet * set)83 static inline void _dbgct(UnicodeSet* set) {
84 UnicodeString str;
85 set->toPattern(str, true);
86 char buf[40];
87 str.extract(0, 39, buf, "");
88 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
89 }
90
_dbgdt(UnicodeSet * set)91 static inline void _dbgdt(UnicodeSet* set) {
92 UnicodeString str;
93 set->toPattern(str, true);
94 char buf[40];
95 str.extract(0, 39, buf, "");
96 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
97 }
98
99 #else
100
101 #define _dbgct(set)
102 #define _dbgdt(set)
103
104 #endif
105
106 //----------------------------------------------------------------
107 // UnicodeString in UVector support
108 //----------------------------------------------------------------
109
cloneUnicodeString(UElement * dst,UElement * src)110 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
111 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
112 }
113
compareUnicodeString(UElement t1,UElement t2)114 static int32_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
115 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
116 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
117 return a.compare(b);
118 }
119
hasStrings() const120 UBool UnicodeSet::hasStrings() const {
121 return strings != nullptr && !strings->isEmpty();
122 }
123
stringsSize() const124 int32_t UnicodeSet::stringsSize() const {
125 return strings == nullptr ? 0 : strings->size();
126 }
127
stringsContains(const UnicodeString & s) const128 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
129 return strings != nullptr && strings->contains((void*) &s);
130 }
131
132 //----------------------------------------------------------------
133 // Constructors &c
134 //----------------------------------------------------------------
135
136 /**
137 * Constructs an empty set.
138 */
UnicodeSet()139 UnicodeSet::UnicodeSet() {
140 list[0] = UNICODESET_HIGH;
141 _dbgct(this);
142 }
143
144 /**
145 * Constructs a set containing the given range. If <code>end >
146 * start</code> then an empty set is created.
147 *
148 * @param start first character, inclusive, of range
149 * @param end last character, inclusive, of range
150 */
UnicodeSet(UChar32 start,UChar32 end)151 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
152 list[0] = UNICODESET_HIGH;
153 add(start, end);
154 _dbgct(this);
155 }
156
157 /**
158 * Constructs a set that is identical to the given UnicodeSet.
159 */
UnicodeSet(const UnicodeSet & o)160 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
161 *this = o;
162 _dbgct(this);
163 }
164
165 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)166 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
167 if (ensureCapacity(o.len)) {
168 // *this = o except for bmpSet and stringSpan
169 len = o.len;
170 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
171 if (o.hasStrings()) {
172 UErrorCode status = U_ZERO_ERROR;
173 if (!allocateStrings(status) ||
174 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
175 setToBogus();
176 return;
177 }
178 }
179 if (o.pat) {
180 setPattern(o.pat, o.patLen);
181 }
182 _dbgct(this);
183 }
184 }
185
186 /**
187 * Destructs the set.
188 */
~UnicodeSet()189 UnicodeSet::~UnicodeSet() {
190 _dbgdt(this); // first!
191 if (list != stackList) {
192 uprv_free(list);
193 }
194 delete bmpSet;
195 if (buffer != stackList) {
196 uprv_free(buffer);
197 }
198 delete strings;
199 delete stringSpan;
200 releasePattern();
201 }
202
203 /**
204 * Assigns this object to be a copy of another.
205 */
operator =(const UnicodeSet & o)206 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
207 return copyFrom(o, false);
208 }
209
copyFrom(const UnicodeSet & o,UBool asThawed)210 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
211 if (this == &o) {
212 return *this;
213 }
214 if (isFrozen()) {
215 return *this;
216 }
217 if (o.isBogus()) {
218 setToBogus();
219 return *this;
220 }
221 if (!ensureCapacity(o.len)) {
222 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
223 return *this;
224 }
225 len = o.len;
226 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
227 if (o.bmpSet != nullptr && !asThawed) {
228 bmpSet = new BMPSet(*o.bmpSet, list, len);
229 if (bmpSet == nullptr) { // Check for memory allocation error.
230 setToBogus();
231 return *this;
232 }
233 }
234 if (o.hasStrings()) {
235 UErrorCode status = U_ZERO_ERROR;
236 if ((strings == nullptr && !allocateStrings(status)) ||
237 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
238 setToBogus();
239 return *this;
240 }
241 } else if (hasStrings()) {
242 strings->removeAllElements();
243 }
244 if (o.stringSpan != nullptr && !asThawed) {
245 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
246 if (stringSpan == nullptr) { // Check for memory allocation error.
247 setToBogus();
248 return *this;
249 }
250 }
251 releasePattern();
252 if (o.pat) {
253 setPattern(o.pat, o.patLen);
254 }
255 return *this;
256 }
257
258 /**
259 * Returns a copy of this object. All UnicodeMatcher objects have
260 * to support cloning in order to allow classes using
261 * UnicodeMatchers, such as Transliterator, to implement cloning.
262 */
clone() const263 UnicodeSet* UnicodeSet::clone() const {
264 return new UnicodeSet(*this);
265 }
266
cloneAsThawed() const267 UnicodeSet *UnicodeSet::cloneAsThawed() const {
268 return new UnicodeSet(*this, true);
269 }
270
271 /**
272 * Compares the specified object with this set for equality. Returns
273 * <tt>true</tt> if the two sets
274 * have the same size, and every member of the specified set is
275 * contained in this set (or equivalently, every member of this set is
276 * contained in the specified set).
277 *
278 * @param o set to be compared for equality with this set.
279 * @return <tt>true</tt> if the specified set is equal to this set.
280 */
operator ==(const UnicodeSet & o) const281 bool UnicodeSet::operator==(const UnicodeSet& o) const {
282 if (len != o.len) return false;
283 for (int32_t i = 0; i < len; ++i) {
284 if (list[i] != o.list[i]) return false;
285 }
286 if (hasStrings() != o.hasStrings()) { return false; }
287 if (hasStrings() && *strings != *o.strings) return false;
288 return true;
289 }
290
291 /**
292 * Returns the hash code value for this set.
293 *
294 * @return the hash code value for this set.
295 * @see Object#hashCode()
296 */
hashCode() const297 int32_t UnicodeSet::hashCode() const {
298 uint32_t result = static_cast<uint32_t>(len);
299 for (int32_t i = 0; i < len; ++i) {
300 result *= 1000003u;
301 result += list[i];
302 }
303 return static_cast<int32_t>(result);
304 }
305
306 //----------------------------------------------------------------
307 // Public API
308 //----------------------------------------------------------------
309
310 /**
311 * Returns the number of elements in this set (its cardinality),
312 * Note than the elements of a set may include both individual
313 * codepoints and strings.
314 *
315 * @return the number of elements in this set (its cardinality).
316 */
size() const317 int32_t UnicodeSet::size() const {
318 int32_t n = 0;
319 int32_t count = getRangeCount();
320 for (int32_t i = 0; i < count; ++i) {
321 n += getRangeEnd(i) - getRangeStart(i) + 1;
322 }
323 return n + stringsSize();
324 }
325
326 /**
327 * Returns <tt>true</tt> if this set contains no elements.
328 *
329 * @return <tt>true</tt> if this set contains no elements.
330 */
isEmpty() const331 UBool UnicodeSet::isEmpty() const {
332 return len == 1 && !hasStrings();
333 }
334
335 /**
336 * Returns true if this set contains the given character.
337 * @param c character to be checked for containment
338 * @return true if the test condition is met
339 */
contains(UChar32 c) const340 UBool UnicodeSet::contains(UChar32 c) const {
341 // Set i to the index of the start item greater than ch
342 // We know we will terminate without length test!
343 // LATER: for large sets, add binary search
344 //int32_t i = -1;
345 //for (;;) {
346 // if (c < list[++i]) break;
347 //}
348 if (bmpSet != nullptr) {
349 return bmpSet->contains(c);
350 }
351 if (stringSpan != nullptr) {
352 return stringSpan->contains(c);
353 }
354 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
355 return false;
356 }
357 int32_t i = findCodePoint(c);
358 return (UBool)(i & 1); // return true if odd
359 }
360
361 /**
362 * Returns the smallest value i such that c < list[i]. Caller
363 * must ensure that c is a legal value or this method will enter
364 * an infinite loop. This method performs a binary search.
365 * @param c a character in the range MIN_VALUE..MAX_VALUE
366 * inclusive
367 * @return the smallest integer i in the range 0..len-1,
368 * inclusive, such that c < list[i]
369 */
findCodePoint(UChar32 c) const370 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
371 /* Examples:
372 findCodePoint(c)
373 set list[] c=0 1 3 4 7 8
374 === ============== ===========
375 [] [110000] 0 0 0 0 0 0
376 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
377 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
378 [:Any:] [0, 110000] 1 1 1 1 1 1
379 */
380
381 // Return the smallest i such that c < list[i]. Assume
382 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
383 if (c < list[0])
384 return 0;
385 // High runner test. c is often after the last range, so an
386 // initial check for this condition pays off.
387 int32_t lo = 0;
388 int32_t hi = len - 1;
389 if (lo >= hi || c >= list[hi-1])
390 return hi;
391 // invariant: c >= list[lo]
392 // invariant: c < list[hi]
393 for (;;) {
394 int32_t i = (lo + hi) >> 1;
395 if (i == lo) {
396 break; // Found!
397 } else if (c < list[i]) {
398 hi = i;
399 } else {
400 lo = i;
401 }
402 }
403 return hi;
404 }
405
406 /**
407 * Returns true if this set contains every character
408 * of the given range.
409 * @param start first character, inclusive, of the range
410 * @param end last character, inclusive, of the range
411 * @return true if the test condition is met
412 */
contains(UChar32 start,UChar32 end) const413 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
414 //int32_t i = -1;
415 //for (;;) {
416 // if (start < list[++i]) break;
417 //}
418 int32_t i = findCodePoint(start);
419 return ((i & 1) != 0 && end < list[i]);
420 }
421
422 /**
423 * Returns <tt>true</tt> if this set contains the given
424 * multicharacter string.
425 * @param s string to be checked for containment
426 * @return <tt>true</tt> if this set contains the specified string
427 */
contains(const UnicodeString & s) const428 UBool UnicodeSet::contains(const UnicodeString& s) const {
429 int32_t cp = getSingleCP(s);
430 if (cp < 0) {
431 return stringsContains(s);
432 } else {
433 return contains((UChar32) cp);
434 }
435 }
436
437 /**
438 * Returns true if this set contains all the characters and strings
439 * of the given set.
440 * @param c set to be checked for containment
441 * @return true if the test condition is met
442 */
containsAll(const UnicodeSet & c) const443 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
444 // The specified set is a subset if all of its pairs are contained in
445 // this set. It's possible to code this more efficiently in terms of
446 // direct manipulation of the inversion lists if the need arises.
447 int32_t n = c.getRangeCount();
448 for (int i=0; i<n; ++i) {
449 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
450 return false;
451 }
452 }
453 return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
454 }
455
456 /**
457 * Returns true if this set contains all the characters
458 * of the given string.
459 * @param s string containing characters to be checked for containment
460 * @return true if the test condition is met
461 */
containsAll(const UnicodeString & s) const462 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
463 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
464 s.length());
465 }
466
467 /**
468 * Returns true if this set contains none of the characters
469 * of the given range.
470 * @param start first character, inclusive, of the range
471 * @param end last character, inclusive, of the range
472 * @return true if the test condition is met
473 */
containsNone(UChar32 start,UChar32 end) const474 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
475 //int32_t i = -1;
476 //for (;;) {
477 // if (start < list[++i]) break;
478 //}
479 int32_t i = findCodePoint(start);
480 return ((i & 1) == 0 && end < list[i]);
481 }
482
483 /**
484 * Returns true if this set contains none of the characters and strings
485 * of the given set.
486 * @param c set to be checked for containment
487 * @return true if the test condition is met
488 */
containsNone(const UnicodeSet & c) const489 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
490 // The specified set is a subset if all of its pairs are contained in
491 // this set. It's possible to code this more efficiently in terms of
492 // direct manipulation of the inversion lists if the need arises.
493 int32_t n = c.getRangeCount();
494 for (int32_t i=0; i<n; ++i) {
495 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
496 return false;
497 }
498 }
499 return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
500 }
501
502 /**
503 * Returns true if this set contains none of the characters
504 * of the given string.
505 * @param s string containing characters to be checked for containment
506 * @return true if the test condition is met
507 */
containsNone(const UnicodeString & s) const508 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
509 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
510 s.length());
511 }
512
513 /**
514 * Returns <tt>true</tt> if this set contains any character whose low byte
515 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
516 * indexing.
517 */
matchesIndexValue(uint8_t v) const518 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
519 /* The index value v, in the range [0,255], is contained in this set if
520 * it is contained in any pair of this set. Pairs either have the high
521 * bytes equal, or unequal. If the high bytes are equal, then we have
522 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
523 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
524 * Then v is contained if xx <= v || v <= yy. (This is identical to the
525 * time zone month containment logic.)
526 */
527 int32_t i;
528 int32_t rangeCount=getRangeCount();
529 for (i=0; i<rangeCount; ++i) {
530 UChar32 low = getRangeStart(i);
531 UChar32 high = getRangeEnd(i);
532 if ((low & ~0xFF) == (high & ~0xFF)) {
533 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
534 return true;
535 }
536 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
537 return true;
538 }
539 }
540 if (hasStrings()) {
541 for (i=0; i<strings->size(); ++i) {
542 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
543 if (s.isEmpty()) {
544 continue; // skip the empty string
545 }
546 UChar32 c = s.char32At(0);
547 if ((c & 0xFF) == v) {
548 return true;
549 }
550 }
551 }
552 return false;
553 }
554
555 /**
556 * Implementation of UnicodeMatcher::matches(). Always matches the
557 * longest possible multichar string.
558 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)559 UMatchDegree UnicodeSet::matches(const Replaceable& text,
560 int32_t& offset,
561 int32_t limit,
562 UBool incremental) {
563 if (offset == limit) {
564 if (contains(U_ETHER)) {
565 return incremental ? U_PARTIAL_MATCH : U_MATCH;
566 } else {
567 return U_MISMATCH;
568 }
569 } else {
570 if (hasStrings()) { // try strings first
571
572 // might separate forward and backward loops later
573 // for now they are combined
574
575 // TODO Improve efficiency of this, at least in the forward
576 // direction, if not in both. In the forward direction we
577 // can assume the strings are sorted.
578
579 int32_t i;
580 UBool forward = offset < limit;
581
582 // firstChar is the leftmost char to match in the
583 // forward direction or the rightmost char to match in
584 // the reverse direction.
585 char16_t firstChar = text.charAt(offset);
586
587 // If there are multiple strings that can match we
588 // return the longest match.
589 int32_t highWaterLength = 0;
590
591 for (i=0; i<strings->size(); ++i) {
592 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
593 if (trial.isEmpty()) {
594 continue; // skip the empty string
595 }
596
597 char16_t c = trial.charAt(forward ? 0 : trial.length() - 1);
598
599 // Strings are sorted, so we can optimize in the
600 // forward direction.
601 if (forward && c > firstChar) break;
602 if (c != firstChar) continue;
603
604 int32_t matchLen = matchRest(text, offset, limit, trial);
605
606 if (incremental) {
607 int32_t maxLen = forward ? limit-offset : offset-limit;
608 if (matchLen == maxLen) {
609 // We have successfully matched but only up to limit.
610 return U_PARTIAL_MATCH;
611 }
612 }
613
614 if (matchLen == trial.length()) {
615 // We have successfully matched the whole string.
616 if (matchLen > highWaterLength) {
617 highWaterLength = matchLen;
618 }
619 // In the forward direction we know strings
620 // are sorted so we can bail early.
621 if (forward && matchLen < highWaterLength) {
622 break;
623 }
624 continue;
625 }
626 }
627
628 // We've checked all strings without a partial match.
629 // If we have full matches, return the longest one.
630 if (highWaterLength != 0) {
631 offset += forward ? highWaterLength : -highWaterLength;
632 return U_MATCH;
633 }
634 }
635 return UnicodeFilter::matches(text, offset, limit, incremental);
636 }
637 }
638
639 /**
640 * Returns the longest match for s in text at the given position.
641 * If limit > start then match forward from start+1 to limit
642 * matching all characters except s.charAt(0). If limit < start,
643 * go backward starting from start-1 matching all characters
644 * except s.charAt(s.length()-1). This method assumes that the
645 * first character, text.charAt(start), matches s, so it does not
646 * check it.
647 * @param text the text to match
648 * @param start the first character to match. In the forward
649 * direction, text.charAt(start) is matched against s.charAt(0).
650 * In the reverse direction, it is matched against
651 * s.charAt(s.length()-1).
652 * @param limit the limit offset for matching, either last+1 in
653 * the forward direction, or last-1 in the reverse direction,
654 * where last is the index of the last character to match.
655 * @return If part of s matches up to the limit, return |limit -
656 * start|. If all of s matches before reaching the limit, return
657 * s.length(). If there is a mismatch between s and text, return
658 * 0
659 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)660 int32_t UnicodeSet::matchRest(const Replaceable& text,
661 int32_t start, int32_t limit,
662 const UnicodeString& s) {
663 int32_t i;
664 int32_t maxLen;
665 int32_t slen = s.length();
666 if (start < limit) {
667 maxLen = limit - start;
668 if (maxLen > slen) maxLen = slen;
669 for (i = 1; i < maxLen; ++i) {
670 if (text.charAt(start + i) != s.charAt(i)) return 0;
671 }
672 } else {
673 maxLen = start - limit;
674 if (maxLen > slen) maxLen = slen;
675 --slen; // <=> slen = s.length() - 1;
676 for (i = 1; i < maxLen; ++i) {
677 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
678 }
679 }
680 return maxLen;
681 }
682
683 /**
684 * Implement of UnicodeMatcher
685 */
addMatchSetTo(UnicodeSet & toUnionTo) const686 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
687 toUnionTo.addAll(*this);
688 }
689
690 /**
691 * Returns the index of the given character within this set, where
692 * the set is ordered by ascending code point. If the character
693 * is not in this set, return -1. The inverse of this method is
694 * <code>charAt()</code>.
695 * @return an index from 0..size()-1, or -1
696 */
indexOf(UChar32 c) const697 int32_t UnicodeSet::indexOf(UChar32 c) const {
698 if (c < MIN_VALUE || c > MAX_VALUE) {
699 return -1;
700 }
701 int32_t i = 0;
702 int32_t n = 0;
703 for (;;) {
704 UChar32 start = list[i++];
705 if (c < start) {
706 return -1;
707 }
708 UChar32 limit = list[i++];
709 if (c < limit) {
710 return n + c - start;
711 }
712 n += limit - start;
713 }
714 }
715
716 /**
717 * Returns the character at the given index within this set, where
718 * the set is ordered by ascending code point. If the index is
719 * out of range, return (UChar32)-1. The inverse of this method is
720 * <code>indexOf()</code>.
721 * @param index an index from 0..size()-1
722 * @return the character at the given index, or (UChar32)-1.
723 */
charAt(int32_t index) const724 UChar32 UnicodeSet::charAt(int32_t index) const {
725 if (index >= 0) {
726 // len2 is the largest even integer <= len, that is, it is len
727 // for even values and len-1 for odd values. With odd values
728 // the last entry is UNICODESET_HIGH.
729 int32_t len2 = len & ~1;
730 for (int32_t i=0; i < len2;) {
731 UChar32 start = list[i++];
732 int32_t count = list[i++] - start;
733 if (index < count) {
734 return (UChar32)(start + index);
735 }
736 index -= count;
737 }
738 }
739 return (UChar32)-1;
740 }
741
742 /**
743 * Make this object represent the range <code>start - end</code>.
744 * If <code>end > start</code> then this object is set to an
745 * an empty range.
746 *
747 * @param start first character in the set, inclusive
748 * @rparam end last character in the set, inclusive
749 */
set(UChar32 start,UChar32 end)750 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
751 clear();
752 complement(start, end);
753 return *this;
754 }
755
756 /**
757 * Adds the specified range to this set if it is not already
758 * present. If this set already contains the specified range,
759 * the call leaves this set unchanged. If <code>end > start</code>
760 * then an empty range is added, leaving the set unchanged.
761 *
762 * @param start first character, inclusive, of range to be added
763 * to this set.
764 * @param end last character, inclusive, of range to be added
765 * to this set.
766 */
add(UChar32 start,UChar32 end)767 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
768 if (pinCodePoint(start) < pinCodePoint(end)) {
769 UChar32 limit = end + 1;
770 // Fast path for adding a new range after the last one.
771 // Odd list length: [..., lastStart, lastLimit, HIGH]
772 if ((len & 1) != 0) {
773 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
774 UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
775 if (lastLimit <= start && !isFrozen() && !isBogus()) {
776 if (lastLimit == start) {
777 // Extend the last range.
778 list[len - 2] = limit;
779 if (limit == UNICODESET_HIGH) {
780 --len;
781 }
782 } else {
783 list[len - 1] = start;
784 if (limit < UNICODESET_HIGH) {
785 if (ensureCapacity(len + 2)) {
786 list[len++] = limit;
787 list[len++] = UNICODESET_HIGH;
788 }
789 } else { // limit == UNICODESET_HIGH
790 if (ensureCapacity(len + 1)) {
791 list[len++] = UNICODESET_HIGH;
792 }
793 }
794 }
795 releasePattern();
796 return *this;
797 }
798 }
799 // This is slow. Could be much faster using findCodePoint(start)
800 // and modifying the list, dealing with adjacent & overlapping ranges.
801 UChar32 range[3] = { start, limit, UNICODESET_HIGH };
802 add(range, 2, 0);
803 } else if (start == end) {
804 add(start);
805 }
806 return *this;
807 }
808
809 // #define DEBUG_US_ADD
810
811 #ifdef DEBUG_US_ADD
812 #include <stdio.h>
dump(UChar32 c)813 void dump(UChar32 c) {
814 if (c <= 0xFF) {
815 printf("%c", (char)c);
816 } else {
817 printf("U+%04X", c);
818 }
819 }
dump(const UChar32 * list,int32_t len)820 void dump(const UChar32* list, int32_t len) {
821 printf("[");
822 for (int32_t i=0; i<len; ++i) {
823 if (i != 0) printf(", ");
824 dump(list[i]);
825 }
826 printf("]");
827 }
828 #endif
829
830 /**
831 * Adds the specified character to this set if it is not already
832 * present. If this set already contains the specified character,
833 * the call leaves this set unchanged.
834 */
add(UChar32 c)835 UnicodeSet& UnicodeSet::add(UChar32 c) {
836 // find smallest i such that c < list[i]
837 // if odd, then it is IN the set
838 // if even, then it is OUT of the set
839 int32_t i = findCodePoint(pinCodePoint(c));
840
841 // already in set?
842 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
843
844 // HIGH is 0x110000
845 // assert(list[len-1] == HIGH);
846
847 // empty = [HIGH]
848 // [start_0, limit_0, start_1, limit_1, HIGH]
849
850 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
851 // ^
852 // list[i]
853
854 // i == 0 means c is before the first range
855
856 #ifdef DEBUG_US_ADD
857 printf("Add of ");
858 dump(c);
859 printf(" found at %d", i);
860 printf(": ");
861 dump(list, len);
862 printf(" => ");
863 #endif
864
865 if (c == list[i]-1) {
866 // c is before start of next range
867 list[i] = c;
868 // if we touched the HIGH mark, then add a new one
869 if (c == (UNICODESET_HIGH - 1)) {
870 if (!ensureCapacity(len+1)) {
871 // ensureCapacity will mark the object as Bogus if OOM failure happens.
872 return *this;
873 }
874 list[len++] = UNICODESET_HIGH;
875 }
876 if (i > 0 && c == list[i-1]) {
877 // collapse adjacent ranges
878
879 // [..., start_k-1, c, c, limit_k, ..., HIGH]
880 // ^
881 // list[i]
882
883 //for (int32_t k=i-1; k<len-2; ++k) {
884 // list[k] = list[k+2];
885 //}
886 UChar32* dst = list + i - 1;
887 UChar32* src = dst + 2;
888 UChar32* srclimit = list + len;
889 while (src < srclimit) *(dst++) = *(src++);
890
891 len -= 2;
892 }
893 }
894
895 else if (i > 0 && c == list[i-1]) {
896 // c is after end of prior range
897 list[i-1]++;
898 // no need to check for collapse here
899 }
900
901 else {
902 // At this point we know the new char is not adjacent to
903 // any existing ranges, and it is not 10FFFF.
904
905
906 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
907 // ^
908 // list[i]
909
910 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
911 // ^
912 // list[i]
913
914 if (!ensureCapacity(len+2)) {
915 // ensureCapacity will mark the object as Bogus if OOM failure happens.
916 return *this;
917 }
918
919 UChar32 *p = list + i;
920 uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
921 list[i] = c;
922 list[i+1] = c+1;
923 len += 2;
924 }
925
926 #ifdef DEBUG_US_ADD
927 dump(list, len);
928 printf("\n");
929
930 for (i=1; i<len; ++i) {
931 if (list[i] <= list[i-1]) {
932 // Corrupt array!
933 printf("ERROR: list has been corrupted\n");
934 exit(1);
935 }
936 }
937 #endif
938
939 releasePattern();
940 return *this;
941 }
942
943 /**
944 * Adds the specified multicharacter to this set if it is not already
945 * present. If this set already contains the multicharacter,
946 * the call leaves this set unchanged.
947 * Thus "ch" => {"ch"}
948 *
949 * @param s the source string
950 * @return the modified set, for chaining
951 */
add(const UnicodeString & s)952 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
953 if (isFrozen() || isBogus()) return *this;
954 int32_t cp = getSingleCP(s);
955 if (cp < 0) {
956 if (!stringsContains(s)) {
957 _add(s);
958 releasePattern();
959 }
960 } else {
961 add((UChar32)cp);
962 }
963 return *this;
964 }
965
966 /**
967 * Adds the given string, in order, to 'strings'. The given string
968 * must have been checked by the caller to not already be in 'strings'.
969 */
_add(const UnicodeString & s)970 void UnicodeSet::_add(const UnicodeString& s) {
971 if (isFrozen() || isBogus()) {
972 return;
973 }
974 UErrorCode ec = U_ZERO_ERROR;
975 if (strings == nullptr && !allocateStrings(ec)) {
976 setToBogus();
977 return;
978 }
979 UnicodeString* t = new UnicodeString(s);
980 if (t == nullptr) { // Check for memory allocation error.
981 setToBogus();
982 return;
983 }
984 strings->sortedInsert(t, compareUnicodeString, ec);
985 if (U_FAILURE(ec)) {
986 setToBogus();
987 }
988 }
989
990 /**
991 * @return a code point IF the string consists of a single one.
992 * otherwise returns -1.
993 * @param string to test
994 */
getSingleCP(const UnicodeString & s)995 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
996 int32_t sLength = s.length();
997 if (sLength == 1) return s.charAt(0);
998 if (sLength == 2) {
999 UChar32 cp = s.char32At(0);
1000 if (cp > 0xFFFF) { // is surrogate pair
1001 return cp;
1002 }
1003 }
1004 return -1;
1005 }
1006
1007 /**
1008 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1009 * If this set already any particular character, it has no effect on that character.
1010 * @param the source string
1011 * @return the modified set, for chaining
1012 */
addAll(const UnicodeString & s)1013 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1014 UChar32 cp;
1015 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1016 cp = s.char32At(i);
1017 add(cp);
1018 }
1019 return *this;
1020 }
1021
1022 /**
1023 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1024 * If this set already any particular character, it has no effect on that character.
1025 * @param the source string
1026 * @return the modified set, for chaining
1027 */
retainAll(const UnicodeString & s)1028 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1029 UnicodeSet set;
1030 set.addAll(s);
1031 retainAll(set);
1032 return *this;
1033 }
1034
1035 /**
1036 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1037 * If this set already any particular character, it has no effect on that character.
1038 * @param the source string
1039 * @return the modified set, for chaining
1040 */
complementAll(const UnicodeString & s)1041 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1042 UnicodeSet set;
1043 set.addAll(s);
1044 complementAll(set);
1045 return *this;
1046 }
1047
1048 /**
1049 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1050 * If this set already any particular character, it has no effect on that character.
1051 * @param the source string
1052 * @return the modified set, for chaining
1053 */
removeAll(const UnicodeString & s)1054 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1055 UnicodeSet set;
1056 set.addAll(s);
1057 removeAll(set);
1058 return *this;
1059 }
1060
removeAllStrings()1061 UnicodeSet& UnicodeSet::removeAllStrings() {
1062 if (!isFrozen() && hasStrings()) {
1063 strings->removeAllElements();
1064 releasePattern();
1065 }
1066 return *this;
1067 }
1068
1069
1070 /**
1071 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1072 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1073 * @param the source string
1074 * @return a newly created set containing the given string
1075 */
createFrom(const UnicodeString & s)1076 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1077 UnicodeSet *set = new UnicodeSet();
1078 if (set != nullptr) { // Check for memory allocation error.
1079 set->add(s);
1080 }
1081 return set;
1082 }
1083
1084
1085 /**
1086 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1087 * @param the source string
1088 * @return a newly created set containing the given characters
1089 */
createFromAll(const UnicodeString & s)1090 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1091 UnicodeSet *set = new UnicodeSet();
1092 if (set != nullptr) { // Check for memory allocation error.
1093 set->addAll(s);
1094 }
1095 return set;
1096 }
1097
1098 /**
1099 * Retain only the elements in this set that are contained in the
1100 * specified range. If <code>end > start</code> then an empty range is
1101 * retained, leaving the set empty.
1102 *
1103 * @param start first character, inclusive, of range to be retained
1104 * to this set.
1105 * @param end last character, inclusive, of range to be retained
1106 * to this set.
1107 */
retain(UChar32 start,UChar32 end)1108 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1109 if (pinCodePoint(start) <= pinCodePoint(end)) {
1110 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1111 retain(range, 2, 0);
1112 } else {
1113 clear();
1114 }
1115 return *this;
1116 }
1117
retain(UChar32 c)1118 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1119 return retain(c, c);
1120 }
1121
retain(const UnicodeString & s)1122 UnicodeSet& UnicodeSet::retain(const UnicodeString &s) {
1123 if (isFrozen() || isBogus()) { return *this; }
1124 UChar32 cp = getSingleCP(s);
1125 if (cp < 0) {
1126 bool isIn = stringsContains(s);
1127 // Check for getRangeCount() first to avoid somewhat-expensive size()
1128 // when there are single code points.
1129 if (isIn && getRangeCount() == 0 && size() == 1) {
1130 return *this;
1131 }
1132 clear();
1133 if (isIn) {
1134 _add(s);
1135 }
1136 } else {
1137 retain(cp, cp);
1138 }
1139 return *this;
1140 }
1141
1142 /**
1143 * Removes the specified range from this set if it is present.
1144 * The set will not contain the specified range once the call
1145 * returns. If <code>end > start</code> then an empty range is
1146 * removed, leaving the set unchanged.
1147 *
1148 * @param start first character, inclusive, of range to be removed
1149 * from this set.
1150 * @param end last character, inclusive, of range to be removed
1151 * from this set.
1152 */
remove(UChar32 start,UChar32 end)1153 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1154 if (pinCodePoint(start) <= pinCodePoint(end)) {
1155 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1156 retain(range, 2, 2);
1157 }
1158 return *this;
1159 }
1160
1161 /**
1162 * Removes the specified character from this set if it is present.
1163 * The set will not contain the specified range once the call
1164 * returns.
1165 */
remove(UChar32 c)1166 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1167 return remove(c, c);
1168 }
1169
1170 /**
1171 * Removes the specified string from this set if it is present.
1172 * The set will not contain the specified character once the call
1173 * returns.
1174 * @param the source string
1175 * @return the modified set, for chaining
1176 */
remove(const UnicodeString & s)1177 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1178 if (isFrozen() || isBogus()) return *this;
1179 int32_t cp = getSingleCP(s);
1180 if (cp < 0) {
1181 if (strings != nullptr && strings->removeElement((void*) &s)) {
1182 releasePattern();
1183 }
1184 } else {
1185 remove((UChar32)cp, (UChar32)cp);
1186 }
1187 return *this;
1188 }
1189
1190 /**
1191 * Complements the specified range in this set. Any character in
1192 * the range will be removed if it is in this set, or will be
1193 * added if it is not in this set. If <code>end > start</code>
1194 * then an empty range is xor'ed, leaving the set unchanged.
1195 *
1196 * @param start first character, inclusive, of range to be removed
1197 * from this set.
1198 * @param end last character, inclusive, of range to be removed
1199 * from this set.
1200 */
complement(UChar32 start,UChar32 end)1201 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1202 if (isFrozen() || isBogus()) {
1203 return *this;
1204 }
1205 if (pinCodePoint(start) <= pinCodePoint(end)) {
1206 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1207 exclusiveOr(range, 2, 0);
1208 }
1209 releasePattern();
1210 return *this;
1211 }
1212
complement(UChar32 c)1213 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1214 return complement(c, c);
1215 }
1216
1217 /**
1218 * This is equivalent to
1219 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1220 */
complement()1221 UnicodeSet& UnicodeSet::complement() {
1222 if (isFrozen() || isBogus()) {
1223 return *this;
1224 }
1225 if (list[0] == UNICODESET_LOW) {
1226 uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1227 --len;
1228 } else {
1229 if (!ensureCapacity(len+1)) {
1230 return *this;
1231 }
1232 uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1233 list[0] = UNICODESET_LOW;
1234 ++len;
1235 }
1236 releasePattern();
1237 return *this;
1238 }
1239
1240 /**
1241 * Complement the specified string in this set.
1242 * The set will not contain the specified string once the call
1243 * returns.
1244 *
1245 * @param s the string to complement
1246 * @return this object, for chaining
1247 */
complement(const UnicodeString & s)1248 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1249 if (isFrozen() || isBogus()) return *this;
1250 int32_t cp = getSingleCP(s);
1251 if (cp < 0) {
1252 if (stringsContains(s)) {
1253 strings->removeElement((void*) &s);
1254 } else {
1255 _add(s);
1256 }
1257 releasePattern();
1258 } else {
1259 complement((UChar32)cp, (UChar32)cp);
1260 }
1261 return *this;
1262 }
1263
1264 /**
1265 * Adds all of the elements in the specified set to this set if
1266 * they're not already present. This operation effectively
1267 * modifies this set so that its value is the <i>union</i> of the two
1268 * sets. The behavior of this operation is unspecified if the specified
1269 * collection is modified while the operation is in progress.
1270 *
1271 * @param c set whose elements are to be added to this set.
1272 * @see #add(char, char)
1273 */
addAll(const UnicodeSet & c)1274 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1275 if ( c.len>0 && c.list!=nullptr ) {
1276 add(c.list, c.len, 0);
1277 }
1278
1279 // Add strings in order
1280 if ( c.strings!=nullptr ) {
1281 for (int32_t i=0; i<c.strings->size(); ++i) {
1282 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1283 if (!stringsContains(*s)) {
1284 _add(*s);
1285 }
1286 }
1287 }
1288 return *this;
1289 }
1290
1291 /**
1292 * Retains only the elements in this set that are contained in the
1293 * specified set. In other words, removes from this set all of
1294 * its elements that are not contained in the specified set. This
1295 * operation effectively modifies this set so that its value is
1296 * the <i>intersection</i> of the two sets.
1297 *
1298 * @param c set that defines which elements this set will retain.
1299 */
retainAll(const UnicodeSet & c)1300 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1301 if (isFrozen() || isBogus()) {
1302 return *this;
1303 }
1304 retain(c.list, c.len, 0);
1305 if (hasStrings()) {
1306 if (!c.hasStrings()) {
1307 strings->removeAllElements();
1308 } else {
1309 strings->retainAll(*c.strings);
1310 }
1311 }
1312 return *this;
1313 }
1314
1315 /**
1316 * Removes from this set all of its elements that are contained in the
1317 * specified set. This operation effectively modifies this
1318 * set so that its value is the <i>asymmetric set difference</i> of
1319 * the two sets.
1320 *
1321 * @param c set that defines which elements will be removed from
1322 * this set.
1323 */
removeAll(const UnicodeSet & c)1324 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1325 if (isFrozen() || isBogus()) {
1326 return *this;
1327 }
1328 retain(c.list, c.len, 2);
1329 if (hasStrings() && c.hasStrings()) {
1330 strings->removeAll(*c.strings);
1331 }
1332 return *this;
1333 }
1334
1335 /**
1336 * Complements in this set all elements contained in the specified
1337 * set. Any character in the other set will be removed if it is
1338 * in this set, or will be added if it is not in this set.
1339 *
1340 * @param c set that defines which elements will be xor'ed from
1341 * this set.
1342 */
complementAll(const UnicodeSet & c)1343 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1344 if (isFrozen() || isBogus()) {
1345 return *this;
1346 }
1347 exclusiveOr(c.list, c.len, 0);
1348
1349 if (c.strings != nullptr) {
1350 for (int32_t i=0; i<c.strings->size(); ++i) {
1351 void* e = c.strings->elementAt(i);
1352 if (strings == nullptr || !strings->removeElement(e)) {
1353 _add(*(const UnicodeString*)e);
1354 }
1355 }
1356 }
1357 return *this;
1358 }
1359
1360 /**
1361 * Removes all of the elements from this set. This set will be
1362 * empty after this call returns.
1363 */
clear()1364 UnicodeSet& UnicodeSet::clear() {
1365 if (isFrozen()) {
1366 return *this;
1367 }
1368 list[0] = UNICODESET_HIGH;
1369 len = 1;
1370 releasePattern();
1371 if (strings != nullptr) {
1372 strings->removeAllElements();
1373 }
1374 // Remove bogus
1375 fFlags = 0;
1376 return *this;
1377 }
1378
1379 /**
1380 * Iteration method that returns the number of ranges contained in
1381 * this set.
1382 * @see #getRangeStart
1383 * @see #getRangeEnd
1384 */
getRangeCount() const1385 int32_t UnicodeSet::getRangeCount() const {
1386 return len/2;
1387 }
1388
1389 /**
1390 * Iteration method that returns the first character in the
1391 * specified range of this set.
1392 * @see #getRangeCount
1393 * @see #getRangeEnd
1394 */
getRangeStart(int32_t index) const1395 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1396 return list[index*2];
1397 }
1398
1399 /**
1400 * Iteration method that returns the last character in the
1401 * specified range of this set.
1402 * @see #getRangeStart
1403 * @see #getRangeEnd
1404 */
getRangeEnd(int32_t index) const1405 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1406 return list[index*2 + 1] - 1;
1407 }
1408
getString(int32_t index) const1409 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1410 return (const UnicodeString*) strings->elementAt(index);
1411 }
1412
1413 /**
1414 * Reallocate this objects internal structures to take up the least
1415 * possible space, without changing this object's value.
1416 */
compact()1417 UnicodeSet& UnicodeSet::compact() {
1418 if (isFrozen() || isBogus()) {
1419 return *this;
1420 }
1421 // Delete buffer first to defragment memory less.
1422 if (buffer != stackList) {
1423 uprv_free(buffer);
1424 buffer = nullptr;
1425 bufferCapacity = 0;
1426 }
1427 if (list == stackList) {
1428 // pass
1429 } else if (len <= INITIAL_CAPACITY) {
1430 uprv_memcpy(stackList, list, len * sizeof(UChar32));
1431 uprv_free(list);
1432 list = stackList;
1433 capacity = INITIAL_CAPACITY;
1434 } else if ((len + 7) < capacity) {
1435 // If we have more than a little unused capacity, shrink it to len.
1436 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1437 if (temp) {
1438 list = temp;
1439 capacity = len;
1440 }
1441 // else what the heck happened?! We allocated less memory!
1442 // Oh well. We'll keep our original array.
1443 }
1444 if (strings != nullptr && strings->isEmpty()) {
1445 delete strings;
1446 strings = nullptr;
1447 }
1448 return *this;
1449 }
1450
1451 #ifdef DEBUG_SERIALIZE
1452 #include <stdio.h>
1453 #endif
1454
1455 /**
1456 * Deserialize constructor.
1457 */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1458 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1459 UErrorCode &ec) {
1460
1461 if(U_FAILURE(ec)) {
1462 setToBogus();
1463 return;
1464 }
1465
1466 if( (serialization != kSerialized)
1467 || (data==nullptr)
1468 || (dataLen < 1)) {
1469 ec = U_ILLEGAL_ARGUMENT_ERROR;
1470 setToBogus();
1471 return;
1472 }
1473
1474 // bmp?
1475 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1476 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1477
1478 int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1479 #ifdef DEBUG_SERIALIZE
1480 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1481 #endif
1482 if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
1483 return;
1484 }
1485 // copy bmp
1486 int32_t i;
1487 for(i = 0; i< bmpLength;i++) {
1488 list[i] = data[i+headerSize];
1489 #ifdef DEBUG_SERIALIZE
1490 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1491 #endif
1492 }
1493 // copy smp
1494 for(i=bmpLength;i<newLength;i++) {
1495 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1496 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1497 #ifdef DEBUG_SERIALIZE
1498 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1499 #endif
1500 }
1501 U_ASSERT(i == newLength);
1502 if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1503 list[i++] = UNICODESET_HIGH;
1504 }
1505 len = i;
1506 }
1507
1508
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1509 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1510 int32_t bmpLength, length, destLength;
1511
1512 if (U_FAILURE(ec)) {
1513 return 0;
1514 }
1515
1516 if (destCapacity<0 || (destCapacity>0 && dest==nullptr)) {
1517 ec=U_ILLEGAL_ARGUMENT_ERROR;
1518 return 0;
1519 }
1520
1521 /* count necessary 16-bit units */
1522 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1523 // assert(length>=0);
1524 if (length==0) {
1525 /* empty set */
1526 if (destCapacity>0) {
1527 *dest=0;
1528 } else {
1529 ec=U_BUFFER_OVERFLOW_ERROR;
1530 }
1531 return 1;
1532 }
1533 /* now length>0 */
1534
1535 if (this->list[length-1]<=0xffff) {
1536 /* all BMP */
1537 bmpLength=length;
1538 } else if (this->list[0]>=0x10000) {
1539 /* all supplementary */
1540 bmpLength=0;
1541 length*=2;
1542 } else {
1543 /* some BMP, some supplementary */
1544 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1545 length=bmpLength+2*(length-bmpLength);
1546 }
1547 #ifdef DEBUG_SERIALIZE
1548 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1549 #endif
1550 /* length: number of 16-bit array units */
1551 if (length>0x7fff) {
1552 /* there are only 15 bits for the length in the first serialized word */
1553 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1554 return 0;
1555 }
1556
1557 /*
1558 * total serialized length:
1559 * number of 16-bit array units (length) +
1560 * 1 length unit (always) +
1561 * 1 bmpLength unit (if there are supplementary values)
1562 */
1563 destLength=length+((length>bmpLength)?2:1);
1564 if (destLength<=destCapacity) {
1565 const UChar32 *p;
1566 int32_t i;
1567
1568 #ifdef DEBUG_SERIALIZE
1569 printf("writeHdr\n");
1570 #endif
1571 *dest=(uint16_t)length;
1572 if (length>bmpLength) {
1573 *dest|=0x8000;
1574 *++dest=(uint16_t)bmpLength;
1575 }
1576 ++dest;
1577
1578 /* write the BMP part of the array */
1579 p=this->list;
1580 for (i=0; i<bmpLength; ++i) {
1581 #ifdef DEBUG_SERIALIZE
1582 printf("writebmp: %x\n", (int)*p);
1583 #endif
1584 *dest++=(uint16_t)*p++;
1585 }
1586
1587 /* write the supplementary part of the array */
1588 for (; i<length; i+=2) {
1589 #ifdef DEBUG_SERIALIZE
1590 printf("write32: %x\n", (int)*p);
1591 #endif
1592 *dest++=(uint16_t)(*p>>16);
1593 *dest++=(uint16_t)*p++;
1594 }
1595 } else {
1596 ec=U_BUFFER_OVERFLOW_ERROR;
1597 }
1598 return destLength;
1599 }
1600
1601 //----------------------------------------------------------------
1602 // Implementation: Utility methods
1603 //----------------------------------------------------------------
1604
1605 /**
1606 * Allocate our strings vector and return true if successful.
1607 */
allocateStrings(UErrorCode & status)1608 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1610 return false;
1611 }
1612 strings = new UVector(uprv_deleteUObject,
1613 uhash_compareUnicodeString, 1, status);
1614 if (strings == nullptr) { // Check for memory allocation error.
1615 status = U_MEMORY_ALLOCATION_ERROR;
1616 return false;
1617 }
1618 if (U_FAILURE(status)) {
1619 delete strings;
1620 strings = nullptr;
1621 return false;
1622 }
1623 return true;
1624 }
1625
nextCapacity(int32_t minCapacity)1626 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1627 // Grow exponentially to reduce the frequency of allocations.
1628 if (minCapacity < INITIAL_CAPACITY) {
1629 return minCapacity + INITIAL_CAPACITY;
1630 } else if (minCapacity <= 2500) {
1631 return 5 * minCapacity;
1632 } else {
1633 int32_t newCapacity = 2 * minCapacity;
1634 if (newCapacity > MAX_LENGTH) {
1635 newCapacity = MAX_LENGTH;
1636 }
1637 return newCapacity;
1638 }
1639 }
1640
ensureCapacity(int32_t newLen)1641 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1642 if (newLen > MAX_LENGTH) {
1643 newLen = MAX_LENGTH;
1644 }
1645 if (newLen <= capacity) {
1646 return true;
1647 }
1648 int32_t newCapacity = nextCapacity(newLen);
1649 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1650 if (temp == nullptr) {
1651 setToBogus(); // set the object to bogus state if an OOM failure occurred.
1652 return false;
1653 }
1654 // Copy only the actual contents.
1655 uprv_memcpy(temp, list, len * sizeof(UChar32));
1656 if (list != stackList) {
1657 uprv_free(list);
1658 }
1659 list = temp;
1660 capacity = newCapacity;
1661 return true;
1662 }
1663
ensureBufferCapacity(int32_t newLen)1664 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1665 if (newLen > MAX_LENGTH) {
1666 newLen = MAX_LENGTH;
1667 }
1668 if (newLen <= bufferCapacity) {
1669 return true;
1670 }
1671 int32_t newCapacity = nextCapacity(newLen);
1672 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1673 if (temp == nullptr) {
1674 setToBogus();
1675 return false;
1676 }
1677 // The buffer has no contents to be copied.
1678 // It is always filled from scratch after this call.
1679 if (buffer != stackList) {
1680 uprv_free(buffer);
1681 }
1682 buffer = temp;
1683 bufferCapacity = newCapacity;
1684 return true;
1685 }
1686
1687 /**
1688 * Swap list and buffer.
1689 */
swapBuffers()1690 void UnicodeSet::swapBuffers() {
1691 // swap list and buffer
1692 UChar32* temp = list;
1693 list = buffer;
1694 buffer = temp;
1695
1696 int32_t c = capacity;
1697 capacity = bufferCapacity;
1698 bufferCapacity = c;
1699 }
1700
setToBogus()1701 void UnicodeSet::setToBogus() {
1702 clear(); // Remove everything in the set.
1703 fFlags = kIsBogus;
1704 }
1705
1706 //----------------------------------------------------------------
1707 // Implementation: Fundamental operators
1708 //----------------------------------------------------------------
1709
max(UChar32 a,UChar32 b)1710 static inline UChar32 max(UChar32 a, UChar32 b) {
1711 return (a > b) ? a : b;
1712 }
1713
1714 // polarity = 0, 3 is normal: x xor y
1715 // polarity = 1, 2: x xor ~y == x === y
1716
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1717 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1718 if (isFrozen() || isBogus()) {
1719 return;
1720 }
1721 if (!ensureBufferCapacity(len + otherLen)) {
1722 return;
1723 }
1724
1725 int32_t i = 0, j = 0, k = 0;
1726 UChar32 a = list[i++];
1727 UChar32 b;
1728 if (polarity == 1 || polarity == 2) {
1729 b = UNICODESET_LOW;
1730 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1731 ++j;
1732 b = other[j];
1733 }
1734 } else {
1735 b = other[j++];
1736 }
1737 // simplest of all the routines
1738 // sort the values, discarding identicals!
1739 for (;;) {
1740 if (a < b) {
1741 buffer[k++] = a;
1742 a = list[i++];
1743 } else if (b < a) {
1744 buffer[k++] = b;
1745 b = other[j++];
1746 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1747 // discard both values!
1748 a = list[i++];
1749 b = other[j++];
1750 } else { // DONE!
1751 buffer[k++] = UNICODESET_HIGH;
1752 len = k;
1753 break;
1754 }
1755 }
1756 swapBuffers();
1757 releasePattern();
1758 }
1759
1760 // polarity = 0 is normal: x union y
1761 // polarity = 2: x union ~y
1762 // polarity = 1: ~x union y
1763 // polarity = 3: ~x union ~y
1764
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1765 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1766 if (isFrozen() || isBogus() || other==nullptr) {
1767 return;
1768 }
1769 if (!ensureBufferCapacity(len + otherLen)) {
1770 return;
1771 }
1772
1773 int32_t i = 0, j = 0, k = 0;
1774 UChar32 a = list[i++];
1775 UChar32 b = other[j++];
1776 // change from xor is that we have to check overlapping pairs
1777 // polarity bit 1 means a is second, bit 2 means b is.
1778 for (;;) {
1779 switch (polarity) {
1780 case 0: // both first; take lower if unequal
1781 if (a < b) { // take a
1782 // Back up over overlapping ranges in buffer[]
1783 if (k > 0 && a <= buffer[k-1]) {
1784 // Pick latter end value in buffer[] vs. list[]
1785 a = max(list[i], buffer[--k]);
1786 } else {
1787 // No overlap
1788 buffer[k++] = a;
1789 a = list[i];
1790 }
1791 i++; // Common if/else code factored out
1792 polarity ^= 1;
1793 } else if (b < a) { // take b
1794 if (k > 0 && b <= buffer[k-1]) {
1795 b = max(other[j], buffer[--k]);
1796 } else {
1797 buffer[k++] = b;
1798 b = other[j];
1799 }
1800 j++;
1801 polarity ^= 2;
1802 } else { // a == b, take a, drop b
1803 if (a == UNICODESET_HIGH) goto loop_end;
1804 // This is symmetrical; it doesn't matter if
1805 // we backtrack with a or b. - liu
1806 if (k > 0 && a <= buffer[k-1]) {
1807 a = max(list[i], buffer[--k]);
1808 } else {
1809 // No overlap
1810 buffer[k++] = a;
1811 a = list[i];
1812 }
1813 i++;
1814 polarity ^= 1;
1815 b = other[j++];
1816 polarity ^= 2;
1817 }
1818 break;
1819 case 3: // both second; take higher if unequal, and drop other
1820 if (b <= a) { // take a
1821 if (a == UNICODESET_HIGH) goto loop_end;
1822 buffer[k++] = a;
1823 } else { // take b
1824 if (b == UNICODESET_HIGH) goto loop_end;
1825 buffer[k++] = b;
1826 }
1827 a = list[i++];
1828 polarity ^= 1; // factored common code
1829 b = other[j++];
1830 polarity ^= 2;
1831 break;
1832 case 1: // a second, b first; if b < a, overlap
1833 if (a < b) { // no overlap, take a
1834 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1835 } else if (b < a) { // OVERLAP, drop b
1836 b = other[j++];
1837 polarity ^= 2;
1838 } else { // a == b, drop both!
1839 if (a == UNICODESET_HIGH) goto loop_end;
1840 a = list[i++];
1841 polarity ^= 1;
1842 b = other[j++];
1843 polarity ^= 2;
1844 }
1845 break;
1846 case 2: // a first, b second; if a < b, overlap
1847 if (b < a) { // no overlap, take b
1848 buffer[k++] = b;
1849 b = other[j++];
1850 polarity ^= 2;
1851 } else if (a < b) { // OVERLAP, drop a
1852 a = list[i++];
1853 polarity ^= 1;
1854 } else { // a == b, drop both!
1855 if (a == UNICODESET_HIGH) goto loop_end;
1856 a = list[i++];
1857 polarity ^= 1;
1858 b = other[j++];
1859 polarity ^= 2;
1860 }
1861 break;
1862 }
1863 }
1864 loop_end:
1865 buffer[k++] = UNICODESET_HIGH; // terminate
1866 len = k;
1867 swapBuffers();
1868 releasePattern();
1869 }
1870
1871 // polarity = 0 is normal: x intersect y
1872 // polarity = 2: x intersect ~y == set-minus
1873 // polarity = 1: ~x intersect y
1874 // polarity = 3: ~x intersect ~y
1875
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1876 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1877 if (isFrozen() || isBogus()) {
1878 return;
1879 }
1880 if (!ensureBufferCapacity(len + otherLen)) {
1881 return;
1882 }
1883
1884 int32_t i = 0, j = 0, k = 0;
1885 UChar32 a = list[i++];
1886 UChar32 b = other[j++];
1887 // change from xor is that we have to check overlapping pairs
1888 // polarity bit 1 means a is second, bit 2 means b is.
1889 for (;;) {
1890 switch (polarity) {
1891 case 0: // both first; drop the smaller
1892 if (a < b) { // drop a
1893 a = list[i++];
1894 polarity ^= 1;
1895 } else if (b < a) { // drop b
1896 b = other[j++];
1897 polarity ^= 2;
1898 } else { // a == b, take one, drop other
1899 if (a == UNICODESET_HIGH) goto loop_end;
1900 buffer[k++] = a;
1901 a = list[i++];
1902 polarity ^= 1;
1903 b = other[j++];
1904 polarity ^= 2;
1905 }
1906 break;
1907 case 3: // both second; take lower if unequal
1908 if (a < b) { // take a
1909 buffer[k++] = a;
1910 a = list[i++];
1911 polarity ^= 1;
1912 } else if (b < a) { // take b
1913 buffer[k++] = b;
1914 b = other[j++];
1915 polarity ^= 2;
1916 } else { // a == b, take one, drop other
1917 if (a == UNICODESET_HIGH) goto loop_end;
1918 buffer[k++] = a;
1919 a = list[i++];
1920 polarity ^= 1;
1921 b = other[j++];
1922 polarity ^= 2;
1923 }
1924 break;
1925 case 1: // a second, b first;
1926 if (a < b) { // NO OVERLAP, drop a
1927 a = list[i++];
1928 polarity ^= 1;
1929 } else if (b < a) { // OVERLAP, take b
1930 buffer[k++] = b;
1931 b = other[j++];
1932 polarity ^= 2;
1933 } else { // a == b, drop both!
1934 if (a == UNICODESET_HIGH) goto loop_end;
1935 a = list[i++];
1936 polarity ^= 1;
1937 b = other[j++];
1938 polarity ^= 2;
1939 }
1940 break;
1941 case 2: // a first, b second; if a < b, overlap
1942 if (b < a) { // no overlap, drop b
1943 b = other[j++];
1944 polarity ^= 2;
1945 } else if (a < b) { // OVERLAP, take a
1946 buffer[k++] = a;
1947 a = list[i++];
1948 polarity ^= 1;
1949 } else { // a == b, drop both!
1950 if (a == UNICODESET_HIGH) goto loop_end;
1951 a = list[i++];
1952 polarity ^= 1;
1953 b = other[j++];
1954 polarity ^= 2;
1955 }
1956 break;
1957 }
1958 }
1959 loop_end:
1960 buffer[k++] = UNICODESET_HIGH; // terminate
1961 len = k;
1962 swapBuffers();
1963 releasePattern();
1964 }
1965
1966 /**
1967 * Append the <code>toPattern()</code> representation of a
1968 * string to the given <code>StringBuffer</code>.
1969 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1970 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable) {
1971 UChar32 cp;
1972 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1973 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1974 }
1975 }
1976
1977 /**
1978 * Append the <code>toPattern()</code> representation of a
1979 * character to the given <code>StringBuffer</code>.
1980 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1981 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable) {
1982 if (escapeUnprintable ? ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
1983 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1984 // unprintable
1985 ICU_Utility::escape(buf, c);
1986 return;
1987 }
1988 // Okay to let ':' pass through
1989 switch (c) {
1990 case u'[':
1991 case u']':
1992 case u'-':
1993 case u'^':
1994 case u'&':
1995 case u'\\':
1996 case u'{':
1997 case u'}':
1998 case u':':
1999 case SymbolTable::SYMBOL_REF:
2000 buf.append(u'\\');
2001 break;
2002 default:
2003 // Escape whitespace
2004 if (PatternProps::isWhiteSpace(c)) {
2005 buf.append(u'\\');
2006 }
2007 break;
2008 }
2009 buf.append(c);
2010 }
2011
_appendToPat(UnicodeString & result,UChar32 start,UChar32 end,UBool escapeUnprintable)2012 void UnicodeSet::_appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
2013 UBool escapeUnprintable) {
2014 _appendToPat(result, start, escapeUnprintable);
2015 if (start != end) {
2016 if ((start+1) != end ||
2017 // Avoid writing what looks like a lead+trail surrogate pair.
2018 start == 0xdbff) {
2019 result.append(u'-');
2020 }
2021 _appendToPat(result, end, escapeUnprintable);
2022 }
2023 }
2024
2025 /**
2026 * Append a string representation of this set to result. This will be
2027 * a cleaned version of the string passed to applyPattern(), if there
2028 * is one. Otherwise it will be generated.
2029 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2030 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2031 UBool escapeUnprintable) const
2032 {
2033 if (pat != nullptr) {
2034 int32_t i;
2035 int32_t backslashCount = 0;
2036 for (i=0; i<patLen; ) {
2037 UChar32 c;
2038 U16_NEXT(pat, i, patLen, c);
2039 if (escapeUnprintable ?
2040 ICU_Utility::isUnprintable(c) : ICU_Utility::shouldAlwaysBeEscaped(c)) {
2041 // If the unprintable character is preceded by an odd
2042 // number of backslashes, then it has been escaped.
2043 // Before unescaping it, we delete the final
2044 // backslash.
2045 if ((backslashCount % 2) == 1) {
2046 result.truncate(result.length() - 1);
2047 }
2048 ICU_Utility::escape(result, c);
2049 backslashCount = 0;
2050 } else {
2051 result.append(c);
2052 if (c == u'\\') {
2053 ++backslashCount;
2054 } else {
2055 backslashCount = 0;
2056 }
2057 }
2058 }
2059 return result;
2060 }
2061
2062 return _generatePattern(result, escapeUnprintable);
2063 }
2064
2065 /**
2066 * Returns a string representation of this set. If the result of
2067 * calling this function is passed to a UnicodeSet constructor, it
2068 * will produce another set that is equal to this one.
2069 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2070 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071 UBool escapeUnprintable) const
2072 {
2073 result.truncate(0);
2074 return _toPattern(result, escapeUnprintable);
2075 }
2076
2077 /**
2078 * Generate and append a string representation of this set to result.
2079 * This does not use this.pat, the cleaned up copy of the string
2080 * passed to applyPattern().
2081 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2082 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083 UBool escapeUnprintable) const
2084 {
2085 result.append(u'[');
2086
2087 int32_t i = 0;
2088 int32_t limit = len & ~1; // = 2 * getRangeCount()
2089
2090 // If the set contains at least 2 intervals and includes both
2091 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2092 // be more economical.
2093 // if (getRangeCount() >= 2 &&
2094 // getRangeStart(0) == MIN_VALUE &&
2095 // getRangeEnd(last) == MAX_VALUE)
2096 // Invariant: list[len-1] == HIGH == MAX_VALUE + 1
2097 // If limit == len then len is even and the last range ends with MAX_VALUE.
2098 //
2099 // *But* do not write the inverse (complement) if there are strings.
2100 // Since ICU 70, the '^' performs a code point complement which removes all strings.
2101 if (len >= 4 && list[0] == 0 && limit == len && !hasStrings()) {
2102 // Emit the inverse
2103 result.append(u'^');
2104 // Offsetting the inversion list index by one lets us
2105 // iterate over the ranges of the set complement.
2106 i = 1;
2107 --limit;
2108 }
2109
2110 // Emit the ranges as pairs.
2111 while (i < limit) {
2112 UChar32 start = list[i]; // getRangeStart()
2113 UChar32 end = list[i + 1] - 1; // getRangeEnd() = range limit minus one
2114 if (!(0xd800 <= end && end <= 0xdbff)) {
2115 _appendToPat(result, start, end, escapeUnprintable);
2116 i += 2;
2117 } else {
2118 // The range ends with a lead surrogate.
2119 // Avoid writing what looks like a lead+trail surrogate pair.
2120 // 1. Postpone ranges that start with a lead surrogate code point.
2121 int32_t firstLead = i;
2122 while ((i += 2) < limit && list[i] <= 0xdbff) {}
2123 int32_t firstAfterLead = i;
2124 // 2. Write following ranges that start with a trail surrogate code point.
2125 while (i < limit && (start = list[i]) <= 0xdfff) {
2126 _appendToPat(result, start, list[i + 1] - 1, escapeUnprintable);
2127 i += 2;
2128 }
2129 // 3. Now write the postponed ranges.
2130 for (int j = firstLead; j < firstAfterLead; j += 2) {
2131 _appendToPat(result, list[j], list[j + 1] - 1, escapeUnprintable);
2132 }
2133 }
2134 }
2135
2136 if (strings != nullptr) {
2137 for (int32_t i = 0; i<strings->size(); ++i) {
2138 result.append(u'{');
2139 _appendToPat(result,
2140 *(const UnicodeString*) strings->elementAt(i),
2141 escapeUnprintable);
2142 result.append(u'}');
2143 }
2144 }
2145 return result.append(u']');
2146 }
2147
2148 /**
2149 * Release existing cached pattern
2150 */
releasePattern()2151 void UnicodeSet::releasePattern() {
2152 if (pat) {
2153 uprv_free(pat);
2154 pat = nullptr;
2155 patLen = 0;
2156 }
2157 }
2158
2159 /**
2160 * Set the new pattern to cache.
2161 */
setPattern(const char16_t * newPat,int32_t newPatLen)2162 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2163 releasePattern();
2164 pat = (char16_t *)uprv_malloc((newPatLen + 1) * sizeof(char16_t));
2165 if (pat) {
2166 patLen = newPatLen;
2167 u_memcpy(pat, newPat, patLen);
2168 pat[patLen] = 0;
2169 }
2170 // else we don't care if malloc failed. This was just a nice cache.
2171 // We can regenerate an equivalent pattern later when requested.
2172 }
2173
freeze()2174 UnicodeSet *UnicodeSet::freeze() {
2175 if(!isFrozen() && !isBogus()) {
2176 compact();
2177
2178 // Optimize contains() and span() and similar functions.
2179 if (hasStrings()) {
2180 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2181 if (stringSpan == nullptr) {
2182 setToBogus();
2183 return this;
2184 } else if (!stringSpan->needsStringSpanUTF16()) {
2185 // All strings are irrelevant for span() etc. because
2186 // all of each string's code points are contained in this set.
2187 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2188 // many relevant strings as UTF-16.
2189 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2190 delete stringSpan;
2191 stringSpan = nullptr;
2192 }
2193 }
2194 if (stringSpan == nullptr) {
2195 // No span-relevant strings: Optimize for code point spans.
2196 bmpSet=new BMPSet(list, len);
2197 if (bmpSet == nullptr) { // Check for memory allocation error.
2198 setToBogus();
2199 }
2200 }
2201 }
2202 return this;
2203 }
2204
span(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const2205 int32_t UnicodeSet::span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2206 if(length>0 && bmpSet!=nullptr) {
2207 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2208 }
2209 if(length<0) {
2210 length=u_strlen(s);
2211 }
2212 if(length==0) {
2213 return 0;
2214 }
2215 if(stringSpan!=nullptr) {
2216 return stringSpan->span(s, length, spanCondition);
2217 } else if(hasStrings()) {
2218 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2219 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2220 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2221 UnicodeSetStringSpan strSpan(*this, *strings, which);
2222 if(strSpan.needsStringSpanUTF16()) {
2223 return strSpan.span(s, length, spanCondition);
2224 }
2225 }
2226
2227 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2228 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2229 }
2230
2231 UChar32 c;
2232 int32_t start=0, prev=0;
2233 do {
2234 U16_NEXT(s, start, length, c);
2235 if(spanCondition!=contains(c)) {
2236 break;
2237 }
2238 } while((prev=start)<length);
2239 return prev;
2240 }
2241
spanBack(const char16_t * s,int32_t length,USetSpanCondition spanCondition) const2242 int32_t UnicodeSet::spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const {
2243 if(length>0 && bmpSet!=nullptr) {
2244 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2245 }
2246 if(length<0) {
2247 length=u_strlen(s);
2248 }
2249 if(length==0) {
2250 return 0;
2251 }
2252 if(stringSpan!=nullptr) {
2253 return stringSpan->spanBack(s, length, spanCondition);
2254 } else if(hasStrings()) {
2255 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2256 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2257 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2258 UnicodeSetStringSpan strSpan(*this, *strings, which);
2259 if(strSpan.needsStringSpanUTF16()) {
2260 return strSpan.spanBack(s, length, spanCondition);
2261 }
2262 }
2263
2264 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2265 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2266 }
2267
2268 UChar32 c;
2269 int32_t prev=length;
2270 do {
2271 U16_PREV(s, 0, length, c);
2272 if(spanCondition!=contains(c)) {
2273 break;
2274 }
2275 } while((prev=length)>0);
2276 return prev;
2277 }
2278
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2279 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2280 if(length>0 && bmpSet!=nullptr) {
2281 const uint8_t *s0=(const uint8_t *)s;
2282 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2283 }
2284 if(length<0) {
2285 length=(int32_t)uprv_strlen(s);
2286 }
2287 if(length==0) {
2288 return 0;
2289 }
2290 if(stringSpan!=nullptr) {
2291 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2292 } else if(hasStrings()) {
2293 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2294 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2295 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2296 UnicodeSetStringSpan strSpan(*this, *strings, which);
2297 if(strSpan.needsStringSpanUTF8()) {
2298 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2299 }
2300 }
2301
2302 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2303 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2304 }
2305
2306 UChar32 c;
2307 int32_t start=0, prev=0;
2308 do {
2309 U8_NEXT_OR_FFFD(s, start, length, c);
2310 if(spanCondition!=contains(c)) {
2311 break;
2312 }
2313 } while((prev=start)<length);
2314 return prev;
2315 }
2316
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2317 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2318 if(length>0 && bmpSet!=nullptr) {
2319 const uint8_t *s0=(const uint8_t *)s;
2320 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2321 }
2322 if(length<0) {
2323 length=(int32_t)uprv_strlen(s);
2324 }
2325 if(length==0) {
2326 return 0;
2327 }
2328 if(stringSpan!=nullptr) {
2329 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2330 } else if(hasStrings()) {
2331 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2332 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2333 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2334 UnicodeSetStringSpan strSpan(*this, *strings, which);
2335 if(strSpan.needsStringSpanUTF8()) {
2336 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2337 }
2338 }
2339
2340 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2341 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2342 }
2343
2344 UChar32 c;
2345 int32_t prev=length;
2346 do {
2347 U8_PREV_OR_FFFD(s, 0, length, c);
2348 if(spanCondition!=contains(c)) {
2349 break;
2350 }
2351 } while((prev=length)>0);
2352 return prev;
2353 }
2354
2355 U_NAMESPACE_END
2356