1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 1999-2015, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 **********************************************************************
11 */
12
13 #include "unicode/utypes.h"
14 #include "unicode/parsepos.h"
15 #include "unicode/symtable.h"
16 #include "unicode/uniset.h"
17 #include "unicode/ustring.h"
18 #include "unicode/utf8.h"
19 #include "unicode/utf16.h"
20 #include "ruleiter.h"
21 #include "cmemory.h"
22 #include "cstring.h"
23 #include "patternprops.h"
24 #include "uelement.h"
25 #include "util.h"
26 #include "uvector.h"
27 #include "charstr.h"
28 #include "ustrfmt.h"
29 #include "uassert.h"
30 #include "bmpset.h"
31 #include "unisetspan.h"
32
33 // Define UChar constants using hex for EBCDIC compatibility
34 // Used #define to reduce private static exports and memory access time.
35 #define SET_OPEN ((UChar)0x005B) /*[*/
36 #define SET_CLOSE ((UChar)0x005D) /*]*/
37 #define HYPHEN ((UChar)0x002D) /*-*/
38 #define COMPLEMENT ((UChar)0x005E) /*^*/
39 #define COLON ((UChar)0x003A) /*:*/
40 #define BACKSLASH ((UChar)0x005C) /*\*/
41 #define INTERSECTION ((UChar)0x0026) /*&*/
42 #define UPPER_U ((UChar)0x0055) /*U*/
43 #define LOWER_U ((UChar)0x0075) /*u*/
44 #define OPEN_BRACE ((UChar)123) /*{*/
45 #define CLOSE_BRACE ((UChar)125) /*}*/
46 #define UPPER_P ((UChar)0x0050) /*P*/
47 #define LOWER_P ((UChar)0x0070) /*p*/
48 #define UPPER_N ((UChar)78) /*N*/
49 #define EQUALS ((UChar)0x003D) /*=*/
50
51 // HIGH_VALUE > all valid values. 110000 for codepoints
52 #define UNICODESET_HIGH 0x0110000
53
54 // LOW <= all valid values. ZERO for codepoints
55 #define UNICODESET_LOW 0x000000
56
57 /** Max list [0, 1, 2, ..., max code point, HIGH] */
58 constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1;
59
60 U_NAMESPACE_BEGIN
61
~SymbolTable()62 SymbolTable::~SymbolTable() {}
63
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)64 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeSet)
65
66 /**
67 * Modify the given UChar32 variable so that it is in range, by
68 * pinning values < UNICODESET_LOW to UNICODESET_LOW, and
69 * pinning values > UNICODESET_HIGH-1 to UNICODESET_HIGH-1.
70 * It modifies its argument in-place and also returns it.
71 */
72 static inline UChar32 pinCodePoint(UChar32& c) {
73 if (c < UNICODESET_LOW) {
74 c = UNICODESET_LOW;
75 } else if (c > (UNICODESET_HIGH-1)) {
76 c = (UNICODESET_HIGH-1);
77 }
78 return c;
79 }
80
81 //----------------------------------------------------------------
82 // Debugging
83 //----------------------------------------------------------------
84
85 // DO NOT DELETE THIS CODE. This code is used to debug memory leaks.
86 // To enable the debugging, define the symbol DEBUG_MEM in the line
87 // below. This will result in text being sent to stdout that looks
88 // like this:
89 // DEBUG UnicodeSet: ct 0x00A39B20; 397 [\u0A81-\u0A83\u0A85-
90 // DEBUG UnicodeSet: dt 0x00A39B20; 396 [\u0A81-\u0A83\u0A85-
91 // Each line lists a construction (ct) or destruction (dt) event, the
92 // object address, the number of outstanding objects after the event,
93 // and the pattern of the object in question.
94
95 // #define DEBUG_MEM
96
97 #ifdef DEBUG_MEM
98 #include <stdio.h>
99 static int32_t _dbgCount = 0;
100
_dbgct(UnicodeSet * set)101 static inline void _dbgct(UnicodeSet* set) {
102 UnicodeString str;
103 set->toPattern(str, TRUE);
104 char buf[40];
105 str.extract(0, 39, buf, "");
106 printf("DEBUG UnicodeSet: ct 0x%08X; %d %s\n", set, ++_dbgCount, buf);
107 }
108
_dbgdt(UnicodeSet * set)109 static inline void _dbgdt(UnicodeSet* set) {
110 UnicodeString str;
111 set->toPattern(str, TRUE);
112 char buf[40];
113 str.extract(0, 39, buf, "");
114 printf("DEBUG UnicodeSet: dt 0x%08X; %d %s\n", set, --_dbgCount, buf);
115 }
116
117 #else
118
119 #define _dbgct(set)
120 #define _dbgdt(set)
121
122 #endif
123
124 //----------------------------------------------------------------
125 // UnicodeString in UVector support
126 //----------------------------------------------------------------
127
cloneUnicodeString(UElement * dst,UElement * src)128 static void U_CALLCONV cloneUnicodeString(UElement *dst, UElement *src) {
129 dst->pointer = new UnicodeString(*(UnicodeString*)src->pointer);
130 }
131
compareUnicodeString(UElement t1,UElement t2)132 static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) {
133 const UnicodeString &a = *(const UnicodeString*)t1.pointer;
134 const UnicodeString &b = *(const UnicodeString*)t2.pointer;
135 return a.compare(b);
136 }
137
hasStrings() const138 UBool UnicodeSet::hasStrings() const {
139 return strings != nullptr && !strings->isEmpty();
140 }
141
stringsSize() const142 int32_t UnicodeSet::stringsSize() const {
143 return strings == nullptr ? 0 : strings->size();
144 }
145
stringsContains(const UnicodeString & s) const146 UBool UnicodeSet::stringsContains(const UnicodeString &s) const {
147 return strings != nullptr && strings->contains((void*) &s);
148 }
149
150 //----------------------------------------------------------------
151 // Constructors &c
152 //----------------------------------------------------------------
153
154 /**
155 * Constructs an empty set.
156 */
UnicodeSet()157 UnicodeSet::UnicodeSet() {
158 list[0] = UNICODESET_HIGH;
159 _dbgct(this);
160 }
161
162 /**
163 * Constructs a set containing the given range. If <code>end >
164 * start</code> then an empty set is created.
165 *
166 * @param start first character, inclusive, of range
167 * @param end last character, inclusive, of range
168 */
UnicodeSet(UChar32 start,UChar32 end)169 UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) {
170 list[0] = UNICODESET_HIGH;
171 add(start, end);
172 _dbgct(this);
173 }
174
175 /**
176 * Constructs a set that is identical to the given UnicodeSet.
177 */
UnicodeSet(const UnicodeSet & o)178 UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) {
179 *this = o;
180 _dbgct(this);
181 }
182
183 // Copy-construct as thawed.
UnicodeSet(const UnicodeSet & o,UBool)184 UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) {
185 if (ensureCapacity(o.len)) {
186 // *this = o except for bmpSet and stringSpan
187 len = o.len;
188 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
189 if (o.hasStrings()) {
190 UErrorCode status = U_ZERO_ERROR;
191 if (!allocateStrings(status) ||
192 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
193 setToBogus();
194 return;
195 }
196 }
197 if (o.pat) {
198 setPattern(o.pat, o.patLen);
199 }
200 _dbgct(this);
201 }
202 }
203
204 /**
205 * Destructs the set.
206 */
~UnicodeSet()207 UnicodeSet::~UnicodeSet() {
208 _dbgdt(this); // first!
209 if (list != stackList) {
210 uprv_free(list);
211 }
212 delete bmpSet;
213 if (buffer != stackList) {
214 uprv_free(buffer);
215 }
216 delete strings;
217 delete stringSpan;
218 releasePattern();
219 }
220
221 /**
222 * Assigns this object to be a copy of another.
223 */
operator =(const UnicodeSet & o)224 UnicodeSet& UnicodeSet::operator=(const UnicodeSet& o) {
225 return copyFrom(o, FALSE);
226 }
227
copyFrom(const UnicodeSet & o,UBool asThawed)228 UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) {
229 if (this == &o) {
230 return *this;
231 }
232 if (isFrozen()) {
233 return *this;
234 }
235 if (o.isBogus()) {
236 setToBogus();
237 return *this;
238 }
239 if (!ensureCapacity(o.len)) {
240 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens.
241 return *this;
242 }
243 len = o.len;
244 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32));
245 if (o.bmpSet != nullptr && !asThawed) {
246 bmpSet = new BMPSet(*o.bmpSet, list, len);
247 if (bmpSet == NULL) { // Check for memory allocation error.
248 setToBogus();
249 return *this;
250 }
251 }
252 if (o.hasStrings()) {
253 UErrorCode status = U_ZERO_ERROR;
254 if ((strings == nullptr && !allocateStrings(status)) ||
255 (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) {
256 setToBogus();
257 return *this;
258 }
259 } else if (hasStrings()) {
260 strings->removeAllElements();
261 }
262 if (o.stringSpan != nullptr && !asThawed) {
263 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings);
264 if (stringSpan == NULL) { // Check for memory allocation error.
265 setToBogus();
266 return *this;
267 }
268 }
269 releasePattern();
270 if (o.pat) {
271 setPattern(o.pat, o.patLen);
272 }
273 return *this;
274 }
275
276 /**
277 * Returns a copy of this object. All UnicodeMatcher objects have
278 * to support cloning in order to allow classes using
279 * UnicodeMatchers, such as Transliterator, to implement cloning.
280 */
clone() const281 UnicodeSet* UnicodeSet::clone() const {
282 return new UnicodeSet(*this);
283 }
284
cloneAsThawed() const285 UnicodeSet *UnicodeSet::cloneAsThawed() const {
286 return new UnicodeSet(*this, TRUE);
287 }
288
289 /**
290 * Compares the specified object with this set for equality. Returns
291 * <tt>true</tt> if the two sets
292 * have the same size, and every member of the specified set is
293 * contained in this set (or equivalently, every member of this set is
294 * contained in the specified set).
295 *
296 * @param o set to be compared for equality with this set.
297 * @return <tt>true</tt> if the specified set is equal to this set.
298 */
operator ==(const UnicodeSet & o) const299 UBool UnicodeSet::operator==(const UnicodeSet& o) const {
300 if (len != o.len) return FALSE;
301 for (int32_t i = 0; i < len; ++i) {
302 if (list[i] != o.list[i]) return FALSE;
303 }
304 if (hasStrings() != o.hasStrings()) { return FALSE; }
305 if (hasStrings() && *strings != *o.strings) return FALSE;
306 return TRUE;
307 }
308
309 /**
310 * Returns the hash code value for this set.
311 *
312 * @return the hash code value for this set.
313 * @see Object#hashCode()
314 */
hashCode(void) const315 int32_t UnicodeSet::hashCode(void) const {
316 uint32_t result = static_cast<uint32_t>(len);
317 for (int32_t i = 0; i < len; ++i) {
318 result *= 1000003u;
319 result += list[i];
320 }
321 return static_cast<int32_t>(result);
322 }
323
324 //----------------------------------------------------------------
325 // Public API
326 //----------------------------------------------------------------
327
328 /**
329 * Returns the number of elements in this set (its cardinality),
330 * Note than the elements of a set may include both individual
331 * codepoints and strings.
332 *
333 * @return the number of elements in this set (its cardinality).
334 */
size(void) const335 int32_t UnicodeSet::size(void) const {
336 int32_t n = 0;
337 int32_t count = getRangeCount();
338 for (int32_t i = 0; i < count; ++i) {
339 n += getRangeEnd(i) - getRangeStart(i) + 1;
340 }
341 return n + stringsSize();
342 }
343
344 /**
345 * Returns <tt>true</tt> if this set contains no elements.
346 *
347 * @return <tt>true</tt> if this set contains no elements.
348 */
isEmpty(void) const349 UBool UnicodeSet::isEmpty(void) const {
350 return len == 1 && !hasStrings();
351 }
352
353 /**
354 * Returns true if this set contains the given character.
355 * @param c character to be checked for containment
356 * @return true if the test condition is met
357 */
contains(UChar32 c) const358 UBool UnicodeSet::contains(UChar32 c) const {
359 // Set i to the index of the start item greater than ch
360 // We know we will terminate without length test!
361 // LATER: for large sets, add binary search
362 //int32_t i = -1;
363 //for (;;) {
364 // if (c < list[++i]) break;
365 //}
366 if (bmpSet != NULL) {
367 return bmpSet->contains(c);
368 }
369 if (stringSpan != NULL) {
370 return stringSpan->contains(c);
371 }
372 if (c >= UNICODESET_HIGH) { // Don't need to check LOW bound
373 return FALSE;
374 }
375 int32_t i = findCodePoint(c);
376 return (UBool)(i & 1); // return true if odd
377 }
378
379 /**
380 * Returns the smallest value i such that c < list[i]. Caller
381 * must ensure that c is a legal value or this method will enter
382 * an infinite loop. This method performs a binary search.
383 * @param c a character in the range MIN_VALUE..MAX_VALUE
384 * inclusive
385 * @return the smallest integer i in the range 0..len-1,
386 * inclusive, such that c < list[i]
387 */
findCodePoint(UChar32 c) const388 int32_t UnicodeSet::findCodePoint(UChar32 c) const {
389 /* Examples:
390 findCodePoint(c)
391 set list[] c=0 1 3 4 7 8
392 === ============== ===========
393 [] [110000] 0 0 0 0 0 0
394 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2
395 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2
396 [:Any:] [0, 110000] 1 1 1 1 1 1
397 */
398
399 // Return the smallest i such that c < list[i]. Assume
400 // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
401 if (c < list[0])
402 return 0;
403 // High runner test. c is often after the last range, so an
404 // initial check for this condition pays off.
405 int32_t lo = 0;
406 int32_t hi = len - 1;
407 if (lo >= hi || c >= list[hi-1])
408 return hi;
409 // invariant: c >= list[lo]
410 // invariant: c < list[hi]
411 for (;;) {
412 int32_t i = (lo + hi) >> 1;
413 if (i == lo) {
414 break; // Found!
415 } else if (c < list[i]) {
416 hi = i;
417 } else {
418 lo = i;
419 }
420 }
421 return hi;
422 }
423
424 /**
425 * Returns true if this set contains every character
426 * of the given range.
427 * @param start first character, inclusive, of the range
428 * @param end last character, inclusive, of the range
429 * @return true if the test condition is met
430 */
contains(UChar32 start,UChar32 end) const431 UBool UnicodeSet::contains(UChar32 start, UChar32 end) const {
432 //int32_t i = -1;
433 //for (;;) {
434 // if (start < list[++i]) break;
435 //}
436 int32_t i = findCodePoint(start);
437 return ((i & 1) != 0 && end < list[i]);
438 }
439
440 /**
441 * Returns <tt>true</tt> if this set contains the given
442 * multicharacter string.
443 * @param s string to be checked for containment
444 * @return <tt>true</tt> if this set contains the specified string
445 */
contains(const UnicodeString & s) const446 UBool UnicodeSet::contains(const UnicodeString& s) const {
447 if (s.length() == 0) return FALSE;
448 int32_t cp = getSingleCP(s);
449 if (cp < 0) {
450 return stringsContains(s);
451 } else {
452 return contains((UChar32) cp);
453 }
454 }
455
456 /**
457 * Returns true if this set contains all the characters and strings
458 * of the given set.
459 * @param c set to be checked for containment
460 * @return true if the test condition is met
461 */
containsAll(const UnicodeSet & c) const462 UBool UnicodeSet::containsAll(const UnicodeSet& c) const {
463 // The specified set is a subset if all of its pairs are contained in
464 // this set. It's possible to code this more efficiently in terms of
465 // direct manipulation of the inversion lists if the need arises.
466 int32_t n = c.getRangeCount();
467 for (int i=0; i<n; ++i) {
468 if (!contains(c.getRangeStart(i), c.getRangeEnd(i))) {
469 return FALSE;
470 }
471 }
472 return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings));
473 }
474
475 /**
476 * Returns true if this set contains all the characters
477 * of the given string.
478 * @param s string containing characters to be checked for containment
479 * @return true if the test condition is met
480 */
containsAll(const UnicodeString & s) const481 UBool UnicodeSet::containsAll(const UnicodeString& s) const {
482 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_CONTAINED) ==
483 s.length());
484 }
485
486 /**
487 * Returns true if this set contains none of the characters
488 * of the given range.
489 * @param start first character, inclusive, of the range
490 * @param end last character, inclusive, of the range
491 * @return true if the test condition is met
492 */
containsNone(UChar32 start,UChar32 end) const493 UBool UnicodeSet::containsNone(UChar32 start, UChar32 end) const {
494 //int32_t i = -1;
495 //for (;;) {
496 // if (start < list[++i]) break;
497 //}
498 int32_t i = findCodePoint(start);
499 return ((i & 1) == 0 && end < list[i]);
500 }
501
502 /**
503 * Returns true if this set contains none of the characters and strings
504 * of the given set.
505 * @param c set to be checked for containment
506 * @return true if the test condition is met
507 */
containsNone(const UnicodeSet & c) const508 UBool UnicodeSet::containsNone(const UnicodeSet& c) const {
509 // The specified set is a subset if all of its pairs are contained in
510 // this set. It's possible to code this more efficiently in terms of
511 // direct manipulation of the inversion lists if the need arises.
512 int32_t n = c.getRangeCount();
513 for (int32_t i=0; i<n; ++i) {
514 if (!containsNone(c.getRangeStart(i), c.getRangeEnd(i))) {
515 return FALSE;
516 }
517 }
518 return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings);
519 }
520
521 /**
522 * Returns true if this set contains none of the characters
523 * of the given string.
524 * @param s string containing characters to be checked for containment
525 * @return true if the test condition is met
526 */
containsNone(const UnicodeString & s) const527 UBool UnicodeSet::containsNone(const UnicodeString& s) const {
528 return (UBool)(span(s.getBuffer(), s.length(), USET_SPAN_NOT_CONTAINED) ==
529 s.length());
530 }
531
532 /**
533 * Returns <tt>true</tt> if this set contains any character whose low byte
534 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
535 * indexing.
536 */
matchesIndexValue(uint8_t v) const537 UBool UnicodeSet::matchesIndexValue(uint8_t v) const {
538 /* The index value v, in the range [0,255], is contained in this set if
539 * it is contained in any pair of this set. Pairs either have the high
540 * bytes equal, or unequal. If the high bytes are equal, then we have
541 * aaxx..aayy, where aa is the high byte. Then v is contained if xx <=
542 * v <= yy. If the high bytes are unequal we have aaxx..bbyy, bb>aa.
543 * Then v is contained if xx <= v || v <= yy. (This is identical to the
544 * time zone month containment logic.)
545 */
546 int32_t i;
547 int32_t rangeCount=getRangeCount();
548 for (i=0; i<rangeCount; ++i) {
549 UChar32 low = getRangeStart(i);
550 UChar32 high = getRangeEnd(i);
551 if ((low & ~0xFF) == (high & ~0xFF)) {
552 if ((low & 0xFF) <= v && v <= (high & 0xFF)) {
553 return TRUE;
554 }
555 } else if ((low & 0xFF) <= v || v <= (high & 0xFF)) {
556 return TRUE;
557 }
558 }
559 if (hasStrings()) {
560 for (i=0; i<strings->size(); ++i) {
561 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i);
562 //if (s.length() == 0) {
563 // // Empty strings match everything
564 // return TRUE;
565 //}
566 // assert(s.length() != 0); // We enforce this elsewhere
567 UChar32 c = s.char32At(0);
568 if ((c & 0xFF) == v) {
569 return TRUE;
570 }
571 }
572 }
573 return FALSE;
574 }
575
576 /**
577 * Implementation of UnicodeMatcher::matches(). Always matches the
578 * longest possible multichar string.
579 */
matches(const Replaceable & text,int32_t & offset,int32_t limit,UBool incremental)580 UMatchDegree UnicodeSet::matches(const Replaceable& text,
581 int32_t& offset,
582 int32_t limit,
583 UBool incremental) {
584 if (offset == limit) {
585 // Strings, if any, have length != 0, so we don't worry
586 // about them here. If we ever allow zero-length strings
587 // we much check for them here.
588 if (contains(U_ETHER)) {
589 return incremental ? U_PARTIAL_MATCH : U_MATCH;
590 } else {
591 return U_MISMATCH;
592 }
593 } else {
594 if (hasStrings()) { // try strings first
595
596 // might separate forward and backward loops later
597 // for now they are combined
598
599 // TODO Improve efficiency of this, at least in the forward
600 // direction, if not in both. In the forward direction we
601 // can assume the strings are sorted.
602
603 int32_t i;
604 UBool forward = offset < limit;
605
606 // firstChar is the leftmost char to match in the
607 // forward direction or the rightmost char to match in
608 // the reverse direction.
609 UChar firstChar = text.charAt(offset);
610
611 // If there are multiple strings that can match we
612 // return the longest match.
613 int32_t highWaterLength = 0;
614
615 for (i=0; i<strings->size(); ++i) {
616 const UnicodeString& trial = *(const UnicodeString*)strings->elementAt(i);
617
618 //if (trial.length() == 0) {
619 // return U_MATCH; // null-string always matches
620 //}
621 // assert(trial.length() != 0); // We ensure this elsewhere
622
623 UChar c = trial.charAt(forward ? 0 : trial.length() - 1);
624
625 // Strings are sorted, so we can optimize in the
626 // forward direction.
627 if (forward && c > firstChar) break;
628 if (c != firstChar) continue;
629
630 int32_t matchLen = matchRest(text, offset, limit, trial);
631
632 if (incremental) {
633 int32_t maxLen = forward ? limit-offset : offset-limit;
634 if (matchLen == maxLen) {
635 // We have successfully matched but only up to limit.
636 return U_PARTIAL_MATCH;
637 }
638 }
639
640 if (matchLen == trial.length()) {
641 // We have successfully matched the whole string.
642 if (matchLen > highWaterLength) {
643 highWaterLength = matchLen;
644 }
645 // In the forward direction we know strings
646 // are sorted so we can bail early.
647 if (forward && matchLen < highWaterLength) {
648 break;
649 }
650 continue;
651 }
652 }
653
654 // We've checked all strings without a partial match.
655 // If we have full matches, return the longest one.
656 if (highWaterLength != 0) {
657 offset += forward ? highWaterLength : -highWaterLength;
658 return U_MATCH;
659 }
660 }
661 return UnicodeFilter::matches(text, offset, limit, incremental);
662 }
663 }
664
665 /**
666 * Returns the longest match for s in text at the given position.
667 * If limit > start then match forward from start+1 to limit
668 * matching all characters except s.charAt(0). If limit < start,
669 * go backward starting from start-1 matching all characters
670 * except s.charAt(s.length()-1). This method assumes that the
671 * first character, text.charAt(start), matches s, so it does not
672 * check it.
673 * @param text the text to match
674 * @param start the first character to match. In the forward
675 * direction, text.charAt(start) is matched against s.charAt(0).
676 * In the reverse direction, it is matched against
677 * s.charAt(s.length()-1).
678 * @param limit the limit offset for matching, either last+1 in
679 * the forward direction, or last-1 in the reverse direction,
680 * where last is the index of the last character to match.
681 * @return If part of s matches up to the limit, return |limit -
682 * start|. If all of s matches before reaching the limit, return
683 * s.length(). If there is a mismatch between s and text, return
684 * 0
685 */
matchRest(const Replaceable & text,int32_t start,int32_t limit,const UnicodeString & s)686 int32_t UnicodeSet::matchRest(const Replaceable& text,
687 int32_t start, int32_t limit,
688 const UnicodeString& s) {
689 int32_t i;
690 int32_t maxLen;
691 int32_t slen = s.length();
692 if (start < limit) {
693 maxLen = limit - start;
694 if (maxLen > slen) maxLen = slen;
695 for (i = 1; i < maxLen; ++i) {
696 if (text.charAt(start + i) != s.charAt(i)) return 0;
697 }
698 } else {
699 maxLen = start - limit;
700 if (maxLen > slen) maxLen = slen;
701 --slen; // <=> slen = s.length() - 1;
702 for (i = 1; i < maxLen; ++i) {
703 if (text.charAt(start - i) != s.charAt(slen - i)) return 0;
704 }
705 }
706 return maxLen;
707 }
708
709 /**
710 * Implement of UnicodeMatcher
711 */
addMatchSetTo(UnicodeSet & toUnionTo) const712 void UnicodeSet::addMatchSetTo(UnicodeSet& toUnionTo) const {
713 toUnionTo.addAll(*this);
714 }
715
716 /**
717 * Returns the index of the given character within this set, where
718 * the set is ordered by ascending code point. If the character
719 * is not in this set, return -1. The inverse of this method is
720 * <code>charAt()</code>.
721 * @return an index from 0..size()-1, or -1
722 */
indexOf(UChar32 c) const723 int32_t UnicodeSet::indexOf(UChar32 c) const {
724 if (c < MIN_VALUE || c > MAX_VALUE) {
725 return -1;
726 }
727 int32_t i = 0;
728 int32_t n = 0;
729 for (;;) {
730 UChar32 start = list[i++];
731 if (c < start) {
732 return -1;
733 }
734 UChar32 limit = list[i++];
735 if (c < limit) {
736 return n + c - start;
737 }
738 n += limit - start;
739 }
740 }
741
742 /**
743 * Returns the character at the given index within this set, where
744 * the set is ordered by ascending code point. If the index is
745 * out of range, return (UChar32)-1. The inverse of this method is
746 * <code>indexOf()</code>.
747 * @param index an index from 0..size()-1
748 * @return the character at the given index, or (UChar32)-1.
749 */
charAt(int32_t index) const750 UChar32 UnicodeSet::charAt(int32_t index) const {
751 if (index >= 0) {
752 // len2 is the largest even integer <= len, that is, it is len
753 // for even values and len-1 for odd values. With odd values
754 // the last entry is UNICODESET_HIGH.
755 int32_t len2 = len & ~1;
756 for (int32_t i=0; i < len2;) {
757 UChar32 start = list[i++];
758 int32_t count = list[i++] - start;
759 if (index < count) {
760 return (UChar32)(start + index);
761 }
762 index -= count;
763 }
764 }
765 return (UChar32)-1;
766 }
767
768 /**
769 * Make this object represent the range <code>start - end</code>.
770 * If <code>end > start</code> then this object is set to an
771 * an empty range.
772 *
773 * @param start first character in the set, inclusive
774 * @rparam end last character in the set, inclusive
775 */
set(UChar32 start,UChar32 end)776 UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) {
777 clear();
778 complement(start, end);
779 return *this;
780 }
781
782 /**
783 * Adds the specified range to this set if it is not already
784 * present. If this set already contains the specified range,
785 * the call leaves this set unchanged. If <code>end > start</code>
786 * then an empty range is added, leaving the set unchanged.
787 *
788 * @param start first character, inclusive, of range to be added
789 * to this set.
790 * @param end last character, inclusive, of range to be added
791 * to this set.
792 */
add(UChar32 start,UChar32 end)793 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) {
794 if (pinCodePoint(start) < pinCodePoint(end)) {
795 UChar32 limit = end + 1;
796 // Fast path for adding a new range after the last one.
797 // Odd list length: [..., lastStart, lastLimit, HIGH]
798 if ((len & 1) != 0) {
799 // If the list is empty, set lastLimit low enough to not be adjacent to 0.
800 UChar32 lastLimit = len == 1 ? -2 : list[len - 2];
801 if (lastLimit <= start && !isFrozen() && !isBogus()) {
802 if (lastLimit == start) {
803 // Extend the last range.
804 list[len - 2] = limit;
805 if (limit == UNICODESET_HIGH) {
806 --len;
807 }
808 } else {
809 list[len - 1] = start;
810 if (limit < UNICODESET_HIGH) {
811 if (ensureCapacity(len + 2)) {
812 list[len++] = limit;
813 list[len++] = UNICODESET_HIGH;
814 }
815 } else { // limit == UNICODESET_HIGH
816 if (ensureCapacity(len + 1)) {
817 list[len++] = UNICODESET_HIGH;
818 }
819 }
820 }
821 releasePattern();
822 return *this;
823 }
824 }
825 // This is slow. Could be much faster using findCodePoint(start)
826 // and modifying the list, dealing with adjacent & overlapping ranges.
827 UChar32 range[3] = { start, limit, UNICODESET_HIGH };
828 add(range, 2, 0);
829 } else if (start == end) {
830 add(start);
831 }
832 return *this;
833 }
834
835 // #define DEBUG_US_ADD
836
837 #ifdef DEBUG_US_ADD
838 #include <stdio.h>
dump(UChar32 c)839 void dump(UChar32 c) {
840 if (c <= 0xFF) {
841 printf("%c", (char)c);
842 } else {
843 printf("U+%04X", c);
844 }
845 }
dump(const UChar32 * list,int32_t len)846 void dump(const UChar32* list, int32_t len) {
847 printf("[");
848 for (int32_t i=0; i<len; ++i) {
849 if (i != 0) printf(", ");
850 dump(list[i]);
851 }
852 printf("]");
853 }
854 #endif
855
856 /**
857 * Adds the specified character to this set if it is not already
858 * present. If this set already contains the specified character,
859 * the call leaves this set unchanged.
860 */
add(UChar32 c)861 UnicodeSet& UnicodeSet::add(UChar32 c) {
862 // find smallest i such that c < list[i]
863 // if odd, then it is IN the set
864 // if even, then it is OUT of the set
865 int32_t i = findCodePoint(pinCodePoint(c));
866
867 // already in set?
868 if ((i & 1) != 0 || isFrozen() || isBogus()) return *this;
869
870 // HIGH is 0x110000
871 // assert(list[len-1] == HIGH);
872
873 // empty = [HIGH]
874 // [start_0, limit_0, start_1, limit_1, HIGH]
875
876 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
877 // ^
878 // list[i]
879
880 // i == 0 means c is before the first range
881
882 #ifdef DEBUG_US_ADD
883 printf("Add of ");
884 dump(c);
885 printf(" found at %d", i);
886 printf(": ");
887 dump(list, len);
888 printf(" => ");
889 #endif
890
891 if (c == list[i]-1) {
892 // c is before start of next range
893 list[i] = c;
894 // if we touched the HIGH mark, then add a new one
895 if (c == (UNICODESET_HIGH - 1)) {
896 if (!ensureCapacity(len+1)) {
897 // ensureCapacity will mark the object as Bogus if OOM failure happens.
898 return *this;
899 }
900 list[len++] = UNICODESET_HIGH;
901 }
902 if (i > 0 && c == list[i-1]) {
903 // collapse adjacent ranges
904
905 // [..., start_k-1, c, c, limit_k, ..., HIGH]
906 // ^
907 // list[i]
908
909 //for (int32_t k=i-1; k<len-2; ++k) {
910 // list[k] = list[k+2];
911 //}
912 UChar32* dst = list + i - 1;
913 UChar32* src = dst + 2;
914 UChar32* srclimit = list + len;
915 while (src < srclimit) *(dst++) = *(src++);
916
917 len -= 2;
918 }
919 }
920
921 else if (i > 0 && c == list[i-1]) {
922 // c is after end of prior range
923 list[i-1]++;
924 // no need to check for collapse here
925 }
926
927 else {
928 // At this point we know the new char is not adjacent to
929 // any existing ranges, and it is not 10FFFF.
930
931
932 // [..., start_k-1, limit_k-1, start_k, limit_k, ..., HIGH]
933 // ^
934 // list[i]
935
936 // [..., start_k-1, limit_k-1, c, c+1, start_k, limit_k, ..., HIGH]
937 // ^
938 // list[i]
939
940 if (!ensureCapacity(len+2)) {
941 // ensureCapacity will mark the object as Bogus if OOM failure happens.
942 return *this;
943 }
944
945 UChar32 *p = list + i;
946 uprv_memmove(p + 2, p, (len - i) * sizeof(*p));
947 list[i] = c;
948 list[i+1] = c+1;
949 len += 2;
950 }
951
952 #ifdef DEBUG_US_ADD
953 dump(list, len);
954 printf("\n");
955
956 for (i=1; i<len; ++i) {
957 if (list[i] <= list[i-1]) {
958 // Corrupt array!
959 printf("ERROR: list has been corrupted\n");
960 exit(1);
961 }
962 }
963 #endif
964
965 releasePattern();
966 return *this;
967 }
968
969 /**
970 * Adds the specified multicharacter to this set if it is not already
971 * present. If this set already contains the multicharacter,
972 * the call leaves this set unchanged.
973 * Thus "ch" => {"ch"}
974 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
975 * @param s the source string
976 * @return the modified set, for chaining
977 */
add(const UnicodeString & s)978 UnicodeSet& UnicodeSet::add(const UnicodeString& s) {
979 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
980 int32_t cp = getSingleCP(s);
981 if (cp < 0) {
982 if (!stringsContains(s)) {
983 _add(s);
984 releasePattern();
985 }
986 } else {
987 add((UChar32)cp);
988 }
989 return *this;
990 }
991
992 /**
993 * Adds the given string, in order, to 'strings'. The given string
994 * must have been checked by the caller to not be empty and to not
995 * already be in 'strings'.
996 */
_add(const UnicodeString & s)997 void UnicodeSet::_add(const UnicodeString& s) {
998 if (isFrozen() || isBogus()) {
999 return;
1000 }
1001 UErrorCode ec = U_ZERO_ERROR;
1002 if (strings == nullptr && !allocateStrings(ec)) {
1003 setToBogus();
1004 return;
1005 }
1006 UnicodeString* t = new UnicodeString(s);
1007 if (t == NULL) { // Check for memory allocation error.
1008 setToBogus();
1009 return;
1010 }
1011 strings->sortedInsert(t, compareUnicodeString, ec);
1012 if (U_FAILURE(ec)) {
1013 setToBogus();
1014 delete t;
1015 }
1016 }
1017
1018 /**
1019 * @return a code point IF the string consists of a single one.
1020 * otherwise returns -1.
1021 * @param string to test
1022 */
getSingleCP(const UnicodeString & s)1023 int32_t UnicodeSet::getSingleCP(const UnicodeString& s) {
1024 //if (s.length() < 1) {
1025 // throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
1026 //}
1027 if (s.length() > 2) return -1;
1028 if (s.length() == 1) return s.charAt(0);
1029
1030 // at this point, len = 2
1031 UChar32 cp = s.char32At(0);
1032 if (cp > 0xFFFF) { // is surrogate pair
1033 return cp;
1034 }
1035 return -1;
1036 }
1037
1038 /**
1039 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
1040 * If this set already any particular character, it has no effect on that character.
1041 * @param the source string
1042 * @return the modified set, for chaining
1043 */
addAll(const UnicodeString & s)1044 UnicodeSet& UnicodeSet::addAll(const UnicodeString& s) {
1045 UChar32 cp;
1046 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1047 cp = s.char32At(i);
1048 add(cp);
1049 }
1050 return *this;
1051 }
1052
1053 /**
1054 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1055 * If this set already any particular character, it has no effect on that character.
1056 * @param the source string
1057 * @return the modified set, for chaining
1058 */
retainAll(const UnicodeString & s)1059 UnicodeSet& UnicodeSet::retainAll(const UnicodeString& s) {
1060 UnicodeSet set;
1061 set.addAll(s);
1062 retainAll(set);
1063 return *this;
1064 }
1065
1066 /**
1067 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1068 * If this set already any particular character, it has no effect on that character.
1069 * @param the source string
1070 * @return the modified set, for chaining
1071 */
complementAll(const UnicodeString & s)1072 UnicodeSet& UnicodeSet::complementAll(const UnicodeString& s) {
1073 UnicodeSet set;
1074 set.addAll(s);
1075 complementAll(set);
1076 return *this;
1077 }
1078
1079 /**
1080 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1081 * If this set already any particular character, it has no effect on that character.
1082 * @param the source string
1083 * @return the modified set, for chaining
1084 */
removeAll(const UnicodeString & s)1085 UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) {
1086 UnicodeSet set;
1087 set.addAll(s);
1088 removeAll(set);
1089 return *this;
1090 }
1091
removeAllStrings()1092 UnicodeSet& UnicodeSet::removeAllStrings() {
1093 if (!isFrozen() && hasStrings()) {
1094 strings->removeAllElements();
1095 releasePattern();
1096 }
1097 return *this;
1098 }
1099
1100
1101 /**
1102 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1103 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1104 * @param the source string
1105 * @return a newly created set containing the given string
1106 */
createFrom(const UnicodeString & s)1107 UnicodeSet* U_EXPORT2 UnicodeSet::createFrom(const UnicodeString& s) {
1108 UnicodeSet *set = new UnicodeSet();
1109 if (set != NULL) { // Check for memory allocation error.
1110 set->add(s);
1111 }
1112 return set;
1113 }
1114
1115
1116 /**
1117 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1118 * @param the source string
1119 * @return a newly created set containing the given characters
1120 */
createFromAll(const UnicodeString & s)1121 UnicodeSet* U_EXPORT2 UnicodeSet::createFromAll(const UnicodeString& s) {
1122 UnicodeSet *set = new UnicodeSet();
1123 if (set != NULL) { // Check for memory allocation error.
1124 set->addAll(s);
1125 }
1126 return set;
1127 }
1128
1129 /**
1130 * Retain only the elements in this set that are contained in the
1131 * specified range. If <code>end > start</code> then an empty range is
1132 * retained, leaving the set empty.
1133 *
1134 * @param start first character, inclusive, of range to be retained
1135 * to this set.
1136 * @param end last character, inclusive, of range to be retained
1137 * to this set.
1138 */
retain(UChar32 start,UChar32 end)1139 UnicodeSet& UnicodeSet::retain(UChar32 start, UChar32 end) {
1140 if (pinCodePoint(start) <= pinCodePoint(end)) {
1141 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1142 retain(range, 2, 0);
1143 } else {
1144 clear();
1145 }
1146 return *this;
1147 }
1148
retain(UChar32 c)1149 UnicodeSet& UnicodeSet::retain(UChar32 c) {
1150 return retain(c, c);
1151 }
1152
1153 /**
1154 * Removes the specified range from this set if it is present.
1155 * The set will not contain the specified range once the call
1156 * returns. If <code>end > start</code> then an empty range is
1157 * removed, leaving the set unchanged.
1158 *
1159 * @param start first character, inclusive, of range to be removed
1160 * from this set.
1161 * @param end last character, inclusive, of range to be removed
1162 * from this set.
1163 */
remove(UChar32 start,UChar32 end)1164 UnicodeSet& UnicodeSet::remove(UChar32 start, UChar32 end) {
1165 if (pinCodePoint(start) <= pinCodePoint(end)) {
1166 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1167 retain(range, 2, 2);
1168 }
1169 return *this;
1170 }
1171
1172 /**
1173 * Removes the specified character from this set if it is present.
1174 * The set will not contain the specified range once the call
1175 * returns.
1176 */
remove(UChar32 c)1177 UnicodeSet& UnicodeSet::remove(UChar32 c) {
1178 return remove(c, c);
1179 }
1180
1181 /**
1182 * Removes the specified string from this set if it is present.
1183 * The set will not contain the specified character once the call
1184 * returns.
1185 * @param the source string
1186 * @return the modified set, for chaining
1187 */
remove(const UnicodeString & s)1188 UnicodeSet& UnicodeSet::remove(const UnicodeString& s) {
1189 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1190 int32_t cp = getSingleCP(s);
1191 if (cp < 0) {
1192 if (strings != nullptr && strings->removeElement((void*) &s)) {
1193 releasePattern();
1194 }
1195 } else {
1196 remove((UChar32)cp, (UChar32)cp);
1197 }
1198 return *this;
1199 }
1200
1201 /**
1202 * Complements the specified range in this set. Any character in
1203 * the range will be removed if it is in this set, or will be
1204 * added if it is not in this set. If <code>end > start</code>
1205 * then an empty range is xor'ed, leaving the set unchanged.
1206 *
1207 * @param start first character, inclusive, of range to be removed
1208 * from this set.
1209 * @param end last character, inclusive, of range to be removed
1210 * from this set.
1211 */
complement(UChar32 start,UChar32 end)1212 UnicodeSet& UnicodeSet::complement(UChar32 start, UChar32 end) {
1213 if (isFrozen() || isBogus()) {
1214 return *this;
1215 }
1216 if (pinCodePoint(start) <= pinCodePoint(end)) {
1217 UChar32 range[3] = { start, end+1, UNICODESET_HIGH };
1218 exclusiveOr(range, 2, 0);
1219 }
1220 releasePattern();
1221 return *this;
1222 }
1223
complement(UChar32 c)1224 UnicodeSet& UnicodeSet::complement(UChar32 c) {
1225 return complement(c, c);
1226 }
1227
1228 /**
1229 * This is equivalent to
1230 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1231 */
complement(void)1232 UnicodeSet& UnicodeSet::complement(void) {
1233 if (isFrozen() || isBogus()) {
1234 return *this;
1235 }
1236 if (list[0] == UNICODESET_LOW) {
1237 uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32));
1238 --len;
1239 } else {
1240 if (!ensureCapacity(len+1)) {
1241 return *this;
1242 }
1243 uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32));
1244 list[0] = UNICODESET_LOW;
1245 ++len;
1246 }
1247 releasePattern();
1248 return *this;
1249 }
1250
1251 /**
1252 * Complement the specified string in this set.
1253 * The set will not contain the specified string once the call
1254 * returns.
1255 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1256 * @param s the string to complement
1257 * @return this object, for chaining
1258 */
complement(const UnicodeString & s)1259 UnicodeSet& UnicodeSet::complement(const UnicodeString& s) {
1260 if (s.length() == 0 || isFrozen() || isBogus()) return *this;
1261 int32_t cp = getSingleCP(s);
1262 if (cp < 0) {
1263 if (stringsContains(s)) {
1264 strings->removeElement((void*) &s);
1265 } else {
1266 _add(s);
1267 }
1268 releasePattern();
1269 } else {
1270 complement((UChar32)cp, (UChar32)cp);
1271 }
1272 return *this;
1273 }
1274
1275 /**
1276 * Adds all of the elements in the specified set to this set if
1277 * they're not already present. This operation effectively
1278 * modifies this set so that its value is the <i>union</i> of the two
1279 * sets. The behavior of this operation is unspecified if the specified
1280 * collection is modified while the operation is in progress.
1281 *
1282 * @param c set whose elements are to be added to this set.
1283 * @see #add(char, char)
1284 */
addAll(const UnicodeSet & c)1285 UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) {
1286 if ( c.len>0 && c.list!=NULL ) {
1287 add(c.list, c.len, 0);
1288 }
1289
1290 // Add strings in order
1291 if ( c.strings!=NULL ) {
1292 for (int32_t i=0; i<c.strings->size(); ++i) {
1293 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i);
1294 if (!stringsContains(*s)) {
1295 _add(*s);
1296 }
1297 }
1298 }
1299 return *this;
1300 }
1301
1302 /**
1303 * Retains only the elements in this set that are contained in the
1304 * specified set. In other words, removes from this set all of
1305 * its elements that are not contained in the specified set. This
1306 * operation effectively modifies this set so that its value is
1307 * the <i>intersection</i> of the two sets.
1308 *
1309 * @param c set that defines which elements this set will retain.
1310 */
retainAll(const UnicodeSet & c)1311 UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) {
1312 if (isFrozen() || isBogus()) {
1313 return *this;
1314 }
1315 retain(c.list, c.len, 0);
1316 if (hasStrings()) {
1317 if (!c.hasStrings()) {
1318 strings->removeAllElements();
1319 } else {
1320 strings->retainAll(*c.strings);
1321 }
1322 }
1323 return *this;
1324 }
1325
1326 /**
1327 * Removes from this set all of its elements that are contained in the
1328 * specified set. This operation effectively modifies this
1329 * set so that its value is the <i>asymmetric set difference</i> of
1330 * the two sets.
1331 *
1332 * @param c set that defines which elements will be removed from
1333 * this set.
1334 */
removeAll(const UnicodeSet & c)1335 UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) {
1336 if (isFrozen() || isBogus()) {
1337 return *this;
1338 }
1339 retain(c.list, c.len, 2);
1340 if (hasStrings() && c.hasStrings()) {
1341 strings->removeAll(*c.strings);
1342 }
1343 return *this;
1344 }
1345
1346 /**
1347 * Complements in this set all elements contained in the specified
1348 * set. Any character in the other set will be removed if it is
1349 * in this set, or will be added if it is not in this set.
1350 *
1351 * @param c set that defines which elements will be xor'ed from
1352 * this set.
1353 */
complementAll(const UnicodeSet & c)1354 UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) {
1355 if (isFrozen() || isBogus()) {
1356 return *this;
1357 }
1358 exclusiveOr(c.list, c.len, 0);
1359
1360 if (c.strings != nullptr) {
1361 for (int32_t i=0; i<c.strings->size(); ++i) {
1362 void* e = c.strings->elementAt(i);
1363 if (strings == nullptr || !strings->removeElement(e)) {
1364 _add(*(const UnicodeString*)e);
1365 }
1366 }
1367 }
1368 return *this;
1369 }
1370
1371 /**
1372 * Removes all of the elements from this set. This set will be
1373 * empty after this call returns.
1374 */
clear(void)1375 UnicodeSet& UnicodeSet::clear(void) {
1376 if (isFrozen()) {
1377 return *this;
1378 }
1379 list[0] = UNICODESET_HIGH;
1380 len = 1;
1381 releasePattern();
1382 if (strings != NULL) {
1383 strings->removeAllElements();
1384 }
1385 // Remove bogus
1386 fFlags = 0;
1387 return *this;
1388 }
1389
1390 /**
1391 * Iteration method that returns the number of ranges contained in
1392 * this set.
1393 * @see #getRangeStart
1394 * @see #getRangeEnd
1395 */
getRangeCount() const1396 int32_t UnicodeSet::getRangeCount() const {
1397 return len/2;
1398 }
1399
1400 /**
1401 * Iteration method that returns the first character in the
1402 * specified range of this set.
1403 * @see #getRangeCount
1404 * @see #getRangeEnd
1405 */
getRangeStart(int32_t index) const1406 UChar32 UnicodeSet::getRangeStart(int32_t index) const {
1407 return list[index*2];
1408 }
1409
1410 /**
1411 * Iteration method that returns the last character in the
1412 * specified range of this set.
1413 * @see #getRangeStart
1414 * @see #getRangeEnd
1415 */
getRangeEnd(int32_t index) const1416 UChar32 UnicodeSet::getRangeEnd(int32_t index) const {
1417 return list[index*2 + 1] - 1;
1418 }
1419
getString(int32_t index) const1420 const UnicodeString* UnicodeSet::getString(int32_t index) const {
1421 return (const UnicodeString*) strings->elementAt(index);
1422 }
1423
1424 /**
1425 * Reallocate this objects internal structures to take up the least
1426 * possible space, without changing this object's value.
1427 */
compact()1428 UnicodeSet& UnicodeSet::compact() {
1429 if (isFrozen() || isBogus()) {
1430 return *this;
1431 }
1432 // Delete buffer first to defragment memory less.
1433 if (buffer != stackList) {
1434 uprv_free(buffer);
1435 buffer = NULL;
1436 bufferCapacity = 0;
1437 }
1438 if (list == stackList) {
1439 // pass
1440 } else if (len <= INITIAL_CAPACITY) {
1441 uprv_memcpy(stackList, list, len * sizeof(UChar32));
1442 uprv_free(list);
1443 list = stackList;
1444 capacity = INITIAL_CAPACITY;
1445 } else if ((len + 7) < capacity) {
1446 // If we have more than a little unused capacity, shrink it to len.
1447 UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len);
1448 if (temp) {
1449 list = temp;
1450 capacity = len;
1451 }
1452 // else what the heck happened?! We allocated less memory!
1453 // Oh well. We'll keep our original array.
1454 }
1455 if (strings != nullptr && strings->isEmpty()) {
1456 delete strings;
1457 strings = nullptr;
1458 }
1459 return *this;
1460 }
1461
1462 #ifdef DEBUG_SERIALIZE
1463 #include <stdio.h>
1464 #endif
1465
1466 /**
1467 * Deserialize constructor.
1468 */
UnicodeSet(const uint16_t data[],int32_t dataLen,ESerialization serialization,UErrorCode & ec)1469 UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization,
1470 UErrorCode &ec) {
1471
1472 if(U_FAILURE(ec)) {
1473 setToBogus();
1474 return;
1475 }
1476
1477 if( (serialization != kSerialized)
1478 || (data==NULL)
1479 || (dataLen < 1)) {
1480 ec = U_ILLEGAL_ARGUMENT_ERROR;
1481 setToBogus();
1482 return;
1483 }
1484
1485 // bmp?
1486 int32_t headerSize = ((data[0]&0x8000)) ?2:1;
1487 int32_t bmpLength = (headerSize==1)?data[0]:data[1];
1488
1489 int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength;
1490 #ifdef DEBUG_SERIALIZE
1491 printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]);
1492 #endif
1493 if(!ensureCapacity(newLength + 1)) { // +1 for HIGH
1494 return;
1495 }
1496 // copy bmp
1497 int32_t i;
1498 for(i = 0; i< bmpLength;i++) {
1499 list[i] = data[i+headerSize];
1500 #ifdef DEBUG_SERIALIZE
1501 printf("<<16@%d[%d] %X\n", i+headerSize, i, list[i]);
1502 #endif
1503 }
1504 // copy smp
1505 for(i=bmpLength;i<newLength;i++) {
1506 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) +
1507 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]);
1508 #ifdef DEBUG_SERIALIZE
1509 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]);
1510 #endif
1511 }
1512 U_ASSERT(i == newLength);
1513 if (i == 0 || list[i - 1] != UNICODESET_HIGH) {
1514 list[i++] = UNICODESET_HIGH;
1515 }
1516 len = i;
1517 }
1518
1519
serialize(uint16_t * dest,int32_t destCapacity,UErrorCode & ec) const1520 int32_t UnicodeSet::serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const {
1521 int32_t bmpLength, length, destLength;
1522
1523 if (U_FAILURE(ec)) {
1524 return 0;
1525 }
1526
1527 if (destCapacity<0 || (destCapacity>0 && dest==NULL)) {
1528 ec=U_ILLEGAL_ARGUMENT_ERROR;
1529 return 0;
1530 }
1531
1532 /* count necessary 16-bit units */
1533 length=this->len-1; // Subtract 1 to ignore final UNICODESET_HIGH
1534 // assert(length>=0);
1535 if (length==0) {
1536 /* empty set */
1537 if (destCapacity>0) {
1538 *dest=0;
1539 } else {
1540 ec=U_BUFFER_OVERFLOW_ERROR;
1541 }
1542 return 1;
1543 }
1544 /* now length>0 */
1545
1546 if (this->list[length-1]<=0xffff) {
1547 /* all BMP */
1548 bmpLength=length;
1549 } else if (this->list[0]>=0x10000) {
1550 /* all supplementary */
1551 bmpLength=0;
1552 length*=2;
1553 } else {
1554 /* some BMP, some supplementary */
1555 for (bmpLength=0; bmpLength<length && this->list[bmpLength]<=0xffff; ++bmpLength) {}
1556 length=bmpLength+2*(length-bmpLength);
1557 }
1558 #ifdef DEBUG_SERIALIZE
1559 printf(">> bmpLength%d length%d len%d\n", bmpLength, length, len);
1560 #endif
1561 /* length: number of 16-bit array units */
1562 if (length>0x7fff) {
1563 /* there are only 15 bits for the length in the first serialized word */
1564 ec=U_INDEX_OUTOFBOUNDS_ERROR;
1565 return 0;
1566 }
1567
1568 /*
1569 * total serialized length:
1570 * number of 16-bit array units (length) +
1571 * 1 length unit (always) +
1572 * 1 bmpLength unit (if there are supplementary values)
1573 */
1574 destLength=length+((length>bmpLength)?2:1);
1575 if (destLength<=destCapacity) {
1576 const UChar32 *p;
1577 int32_t i;
1578
1579 #ifdef DEBUG_SERIALIZE
1580 printf("writeHdr\n");
1581 #endif
1582 *dest=(uint16_t)length;
1583 if (length>bmpLength) {
1584 *dest|=0x8000;
1585 *++dest=(uint16_t)bmpLength;
1586 }
1587 ++dest;
1588
1589 /* write the BMP part of the array */
1590 p=this->list;
1591 for (i=0; i<bmpLength; ++i) {
1592 #ifdef DEBUG_SERIALIZE
1593 printf("writebmp: %x\n", (int)*p);
1594 #endif
1595 *dest++=(uint16_t)*p++;
1596 }
1597
1598 /* write the supplementary part of the array */
1599 for (; i<length; i+=2) {
1600 #ifdef DEBUG_SERIALIZE
1601 printf("write32: %x\n", (int)*p);
1602 #endif
1603 *dest++=(uint16_t)(*p>>16);
1604 *dest++=(uint16_t)*p++;
1605 }
1606 } else {
1607 ec=U_BUFFER_OVERFLOW_ERROR;
1608 }
1609 return destLength;
1610 }
1611
1612 //----------------------------------------------------------------
1613 // Implementation: Utility methods
1614 //----------------------------------------------------------------
1615
1616 /**
1617 * Allocate our strings vector and return TRUE if successful.
1618 */
allocateStrings(UErrorCode & status)1619 UBool UnicodeSet::allocateStrings(UErrorCode &status) {
1620 if (U_FAILURE(status)) {
1621 return FALSE;
1622 }
1623 strings = new UVector(uprv_deleteUObject,
1624 uhash_compareUnicodeString, 1, status);
1625 if (strings == NULL) { // Check for memory allocation error.
1626 status = U_MEMORY_ALLOCATION_ERROR;
1627 return FALSE;
1628 }
1629 if (U_FAILURE(status)) {
1630 delete strings;
1631 strings = NULL;
1632 return FALSE;
1633 }
1634 return TRUE;
1635 }
1636
nextCapacity(int32_t minCapacity)1637 int32_t UnicodeSet::nextCapacity(int32_t minCapacity) {
1638 // Grow exponentially to reduce the frequency of allocations.
1639 if (minCapacity < INITIAL_CAPACITY) {
1640 return minCapacity + INITIAL_CAPACITY;
1641 } else if (minCapacity <= 2500) {
1642 return 5 * minCapacity;
1643 } else {
1644 int32_t newCapacity = 2 * minCapacity;
1645 if (newCapacity > MAX_LENGTH) {
1646 newCapacity = MAX_LENGTH;
1647 }
1648 return newCapacity;
1649 }
1650 }
1651
ensureCapacity(int32_t newLen)1652 bool UnicodeSet::ensureCapacity(int32_t newLen) {
1653 if (newLen > MAX_LENGTH) {
1654 newLen = MAX_LENGTH;
1655 }
1656 if (newLen <= capacity) {
1657 return true;
1658 }
1659 int32_t newCapacity = nextCapacity(newLen);
1660 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1661 if (temp == NULL) {
1662 setToBogus(); // set the object to bogus state if an OOM failure occurred.
1663 return false;
1664 }
1665 // Copy only the actual contents.
1666 uprv_memcpy(temp, list, len * sizeof(UChar32));
1667 if (list != stackList) {
1668 uprv_free(list);
1669 }
1670 list = temp;
1671 capacity = newCapacity;
1672 return true;
1673 }
1674
ensureBufferCapacity(int32_t newLen)1675 bool UnicodeSet::ensureBufferCapacity(int32_t newLen) {
1676 if (newLen > MAX_LENGTH) {
1677 newLen = MAX_LENGTH;
1678 }
1679 if (newLen <= bufferCapacity) {
1680 return true;
1681 }
1682 int32_t newCapacity = nextCapacity(newLen);
1683 UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32));
1684 if (temp == NULL) {
1685 setToBogus();
1686 return false;
1687 }
1688 // The buffer has no contents to be copied.
1689 // It is always filled from scratch after this call.
1690 if (buffer != stackList) {
1691 uprv_free(buffer);
1692 }
1693 buffer = temp;
1694 bufferCapacity = newCapacity;
1695 return true;
1696 }
1697
1698 /**
1699 * Swap list and buffer.
1700 */
swapBuffers(void)1701 void UnicodeSet::swapBuffers(void) {
1702 // swap list and buffer
1703 UChar32* temp = list;
1704 list = buffer;
1705 buffer = temp;
1706
1707 int32_t c = capacity;
1708 capacity = bufferCapacity;
1709 bufferCapacity = c;
1710 }
1711
setToBogus()1712 void UnicodeSet::setToBogus() {
1713 clear(); // Remove everything in the set.
1714 fFlags = kIsBogus;
1715 }
1716
1717 //----------------------------------------------------------------
1718 // Implementation: Fundamental operators
1719 //----------------------------------------------------------------
1720
max(UChar32 a,UChar32 b)1721 static inline UChar32 max(UChar32 a, UChar32 b) {
1722 return (a > b) ? a : b;
1723 }
1724
1725 // polarity = 0, 3 is normal: x xor y
1726 // polarity = 1, 2: x xor ~y == x === y
1727
exclusiveOr(const UChar32 * other,int32_t otherLen,int8_t polarity)1728 void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity) {
1729 if (isFrozen() || isBogus()) {
1730 return;
1731 }
1732 if (!ensureBufferCapacity(len + otherLen)) {
1733 return;
1734 }
1735
1736 int32_t i = 0, j = 0, k = 0;
1737 UChar32 a = list[i++];
1738 UChar32 b;
1739 if (polarity == 1 || polarity == 2) {
1740 b = UNICODESET_LOW;
1741 if (other[j] == UNICODESET_LOW) { // skip base if already LOW
1742 ++j;
1743 b = other[j];
1744 }
1745 } else {
1746 b = other[j++];
1747 }
1748 // simplest of all the routines
1749 // sort the values, discarding identicals!
1750 for (;;) {
1751 if (a < b) {
1752 buffer[k++] = a;
1753 a = list[i++];
1754 } else if (b < a) {
1755 buffer[k++] = b;
1756 b = other[j++];
1757 } else if (a != UNICODESET_HIGH) { // at this point, a == b
1758 // discard both values!
1759 a = list[i++];
1760 b = other[j++];
1761 } else { // DONE!
1762 buffer[k++] = UNICODESET_HIGH;
1763 len = k;
1764 break;
1765 }
1766 }
1767 swapBuffers();
1768 releasePattern();
1769 }
1770
1771 // polarity = 0 is normal: x union y
1772 // polarity = 2: x union ~y
1773 // polarity = 1: ~x union y
1774 // polarity = 3: ~x union ~y
1775
add(const UChar32 * other,int32_t otherLen,int8_t polarity)1776 void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) {
1777 if (isFrozen() || isBogus() || other==NULL) {
1778 return;
1779 }
1780 if (!ensureBufferCapacity(len + otherLen)) {
1781 return;
1782 }
1783
1784 int32_t i = 0, j = 0, k = 0;
1785 UChar32 a = list[i++];
1786 UChar32 b = other[j++];
1787 // change from xor is that we have to check overlapping pairs
1788 // polarity bit 1 means a is second, bit 2 means b is.
1789 for (;;) {
1790 switch (polarity) {
1791 case 0: // both first; take lower if unequal
1792 if (a < b) { // take a
1793 // Back up over overlapping ranges in buffer[]
1794 if (k > 0 && a <= buffer[k-1]) {
1795 // Pick latter end value in buffer[] vs. list[]
1796 a = max(list[i], buffer[--k]);
1797 } else {
1798 // No overlap
1799 buffer[k++] = a;
1800 a = list[i];
1801 }
1802 i++; // Common if/else code factored out
1803 polarity ^= 1;
1804 } else if (b < a) { // take b
1805 if (k > 0 && b <= buffer[k-1]) {
1806 b = max(other[j], buffer[--k]);
1807 } else {
1808 buffer[k++] = b;
1809 b = other[j];
1810 }
1811 j++;
1812 polarity ^= 2;
1813 } else { // a == b, take a, drop b
1814 if (a == UNICODESET_HIGH) goto loop_end;
1815 // This is symmetrical; it doesn't matter if
1816 // we backtrack with a or b. - liu
1817 if (k > 0 && a <= buffer[k-1]) {
1818 a = max(list[i], buffer[--k]);
1819 } else {
1820 // No overlap
1821 buffer[k++] = a;
1822 a = list[i];
1823 }
1824 i++;
1825 polarity ^= 1;
1826 b = other[j++];
1827 polarity ^= 2;
1828 }
1829 break;
1830 case 3: // both second; take higher if unequal, and drop other
1831 if (b <= a) { // take a
1832 if (a == UNICODESET_HIGH) goto loop_end;
1833 buffer[k++] = a;
1834 } else { // take b
1835 if (b == UNICODESET_HIGH) goto loop_end;
1836 buffer[k++] = b;
1837 }
1838 a = list[i++];
1839 polarity ^= 1; // factored common code
1840 b = other[j++];
1841 polarity ^= 2;
1842 break;
1843 case 1: // a second, b first; if b < a, overlap
1844 if (a < b) { // no overlap, take a
1845 buffer[k++] = a; a = list[i++]; polarity ^= 1;
1846 } else if (b < a) { // OVERLAP, drop b
1847 b = other[j++];
1848 polarity ^= 2;
1849 } else { // a == b, drop both!
1850 if (a == UNICODESET_HIGH) goto loop_end;
1851 a = list[i++];
1852 polarity ^= 1;
1853 b = other[j++];
1854 polarity ^= 2;
1855 }
1856 break;
1857 case 2: // a first, b second; if a < b, overlap
1858 if (b < a) { // no overlap, take b
1859 buffer[k++] = b;
1860 b = other[j++];
1861 polarity ^= 2;
1862 } else if (a < b) { // OVERLAP, drop a
1863 a = list[i++];
1864 polarity ^= 1;
1865 } else { // a == b, drop both!
1866 if (a == UNICODESET_HIGH) goto loop_end;
1867 a = list[i++];
1868 polarity ^= 1;
1869 b = other[j++];
1870 polarity ^= 2;
1871 }
1872 break;
1873 }
1874 }
1875 loop_end:
1876 buffer[k++] = UNICODESET_HIGH; // terminate
1877 len = k;
1878 swapBuffers();
1879 releasePattern();
1880 }
1881
1882 // polarity = 0 is normal: x intersect y
1883 // polarity = 2: x intersect ~y == set-minus
1884 // polarity = 1: ~x intersect y
1885 // polarity = 3: ~x intersect ~y
1886
retain(const UChar32 * other,int32_t otherLen,int8_t polarity)1887 void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) {
1888 if (isFrozen() || isBogus()) {
1889 return;
1890 }
1891 if (!ensureBufferCapacity(len + otherLen)) {
1892 return;
1893 }
1894
1895 int32_t i = 0, j = 0, k = 0;
1896 UChar32 a = list[i++];
1897 UChar32 b = other[j++];
1898 // change from xor is that we have to check overlapping pairs
1899 // polarity bit 1 means a is second, bit 2 means b is.
1900 for (;;) {
1901 switch (polarity) {
1902 case 0: // both first; drop the smaller
1903 if (a < b) { // drop a
1904 a = list[i++];
1905 polarity ^= 1;
1906 } else if (b < a) { // drop b
1907 b = other[j++];
1908 polarity ^= 2;
1909 } else { // a == b, take one, drop other
1910 if (a == UNICODESET_HIGH) goto loop_end;
1911 buffer[k++] = a;
1912 a = list[i++];
1913 polarity ^= 1;
1914 b = other[j++];
1915 polarity ^= 2;
1916 }
1917 break;
1918 case 3: // both second; take lower if unequal
1919 if (a < b) { // take a
1920 buffer[k++] = a;
1921 a = list[i++];
1922 polarity ^= 1;
1923 } else if (b < a) { // take b
1924 buffer[k++] = b;
1925 b = other[j++];
1926 polarity ^= 2;
1927 } else { // a == b, take one, drop other
1928 if (a == UNICODESET_HIGH) goto loop_end;
1929 buffer[k++] = a;
1930 a = list[i++];
1931 polarity ^= 1;
1932 b = other[j++];
1933 polarity ^= 2;
1934 }
1935 break;
1936 case 1: // a second, b first;
1937 if (a < b) { // NO OVERLAP, drop a
1938 a = list[i++];
1939 polarity ^= 1;
1940 } else if (b < a) { // OVERLAP, take b
1941 buffer[k++] = b;
1942 b = other[j++];
1943 polarity ^= 2;
1944 } else { // a == b, drop both!
1945 if (a == UNICODESET_HIGH) goto loop_end;
1946 a = list[i++];
1947 polarity ^= 1;
1948 b = other[j++];
1949 polarity ^= 2;
1950 }
1951 break;
1952 case 2: // a first, b second; if a < b, overlap
1953 if (b < a) { // no overlap, drop b
1954 b = other[j++];
1955 polarity ^= 2;
1956 } else if (a < b) { // OVERLAP, take a
1957 buffer[k++] = a;
1958 a = list[i++];
1959 polarity ^= 1;
1960 } else { // a == b, drop both!
1961 if (a == UNICODESET_HIGH) goto loop_end;
1962 a = list[i++];
1963 polarity ^= 1;
1964 b = other[j++];
1965 polarity ^= 2;
1966 }
1967 break;
1968 }
1969 }
1970 loop_end:
1971 buffer[k++] = UNICODESET_HIGH; // terminate
1972 len = k;
1973 swapBuffers();
1974 releasePattern();
1975 }
1976
1977 /**
1978 * Append the <code>toPattern()</code> representation of a
1979 * string to the given <code>StringBuffer</code>.
1980 */
_appendToPat(UnicodeString & buf,const UnicodeString & s,UBool escapeUnprintable)1981 void UnicodeSet::_appendToPat(UnicodeString& buf, const UnicodeString& s, UBool
1982 escapeUnprintable) {
1983 UChar32 cp;
1984 for (int32_t i = 0; i < s.length(); i += U16_LENGTH(cp)) {
1985 _appendToPat(buf, cp = s.char32At(i), escapeUnprintable);
1986 }
1987 }
1988
1989 /**
1990 * Append the <code>toPattern()</code> representation of a
1991 * character to the given <code>StringBuffer</code>.
1992 */
_appendToPat(UnicodeString & buf,UChar32 c,UBool escapeUnprintable)1993 void UnicodeSet::_appendToPat(UnicodeString& buf, UChar32 c, UBool
1994 escapeUnprintable) {
1995 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
1996 // Use hex escape notation (\uxxxx or \Uxxxxxxxx) for anything
1997 // unprintable
1998 if (ICU_Utility::escapeUnprintable(buf, c)) {
1999 return;
2000 }
2001 }
2002 // Okay to let ':' pass through
2003 switch (c) {
2004 case SET_OPEN:
2005 case SET_CLOSE:
2006 case HYPHEN:
2007 case COMPLEMENT:
2008 case INTERSECTION:
2009 case BACKSLASH:
2010 case OPEN_BRACE:
2011 case CLOSE_BRACE:
2012 case COLON:
2013 case SymbolTable::SYMBOL_REF:
2014 buf.append(BACKSLASH);
2015 break;
2016 default:
2017 // Escape whitespace
2018 if (PatternProps::isWhiteSpace(c)) {
2019 buf.append(BACKSLASH);
2020 }
2021 break;
2022 }
2023 buf.append(c);
2024 }
2025
2026 /**
2027 * Append a string representation of this set to result. This will be
2028 * a cleaned version of the string passed to applyPattern(), if there
2029 * is one. Otherwise it will be generated.
2030 */
_toPattern(UnicodeString & result,UBool escapeUnprintable) const2031 UnicodeString& UnicodeSet::_toPattern(UnicodeString& result,
2032 UBool escapeUnprintable) const
2033 {
2034 if (pat != NULL) {
2035 int32_t i;
2036 int32_t backslashCount = 0;
2037 for (i=0; i<patLen; ) {
2038 UChar32 c;
2039 U16_NEXT(pat, i, patLen, c);
2040 if (escapeUnprintable && ICU_Utility::isUnprintable(c)) {
2041 // If the unprintable character is preceded by an odd
2042 // number of backslashes, then it has been escaped.
2043 // Before unescaping it, we delete the final
2044 // backslash.
2045 if ((backslashCount % 2) == 1) {
2046 result.truncate(result.length() - 1);
2047 }
2048 ICU_Utility::escapeUnprintable(result, c);
2049 backslashCount = 0;
2050 } else {
2051 result.append(c);
2052 if (c == BACKSLASH) {
2053 ++backslashCount;
2054 } else {
2055 backslashCount = 0;
2056 }
2057 }
2058 }
2059 return result;
2060 }
2061
2062 return _generatePattern(result, escapeUnprintable);
2063 }
2064
2065 /**
2066 * Returns a string representation of this set. If the result of
2067 * calling this function is passed to a UnicodeSet constructor, it
2068 * will produce another set that is equal to this one.
2069 */
toPattern(UnicodeString & result,UBool escapeUnprintable) const2070 UnicodeString& UnicodeSet::toPattern(UnicodeString& result,
2071 UBool escapeUnprintable) const
2072 {
2073 result.truncate(0);
2074 return _toPattern(result, escapeUnprintable);
2075 }
2076
2077 /**
2078 * Generate and append a string representation of this set to result.
2079 * This does not use this.pat, the cleaned up copy of the string
2080 * passed to applyPattern().
2081 */
_generatePattern(UnicodeString & result,UBool escapeUnprintable) const2082 UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result,
2083 UBool escapeUnprintable) const
2084 {
2085 result.append(SET_OPEN);
2086
2087 // // Check against the predefined categories. We implicitly build
2088 // // up ALL category sets the first time toPattern() is called.
2089 // for (int8_t cat=0; cat<Unicode::GENERAL_TYPES_COUNT; ++cat) {
2090 // if (*this == getCategorySet(cat)) {
2091 // result.append(COLON);
2092 // result.append(CATEGORY_NAMES, cat*2, 2);
2093 // return result.append(CATEGORY_CLOSE);
2094 // }
2095 // }
2096
2097 int32_t count = getRangeCount();
2098
2099 // If the set contains at least 2 intervals and includes both
2100 // MIN_VALUE and MAX_VALUE, then the inverse representation will
2101 // be more economical.
2102 if (count > 1 &&
2103 getRangeStart(0) == MIN_VALUE &&
2104 getRangeEnd(count-1) == MAX_VALUE) {
2105
2106 // Emit the inverse
2107 result.append(COMPLEMENT);
2108
2109 for (int32_t i = 1; i < count; ++i) {
2110 UChar32 start = getRangeEnd(i-1)+1;
2111 UChar32 end = getRangeStart(i)-1;
2112 _appendToPat(result, start, escapeUnprintable);
2113 if (start != end) {
2114 if ((start+1) != end) {
2115 result.append(HYPHEN);
2116 }
2117 _appendToPat(result, end, escapeUnprintable);
2118 }
2119 }
2120 }
2121
2122 // Default; emit the ranges as pairs
2123 else {
2124 for (int32_t i = 0; i < count; ++i) {
2125 UChar32 start = getRangeStart(i);
2126 UChar32 end = getRangeEnd(i);
2127 _appendToPat(result, start, escapeUnprintable);
2128 if (start != end) {
2129 if ((start+1) != end) {
2130 result.append(HYPHEN);
2131 }
2132 _appendToPat(result, end, escapeUnprintable);
2133 }
2134 }
2135 }
2136
2137 if (strings != nullptr) {
2138 for (int32_t i = 0; i<strings->size(); ++i) {
2139 result.append(OPEN_BRACE);
2140 _appendToPat(result,
2141 *(const UnicodeString*) strings->elementAt(i),
2142 escapeUnprintable);
2143 result.append(CLOSE_BRACE);
2144 }
2145 }
2146 return result.append(SET_CLOSE);
2147 }
2148
2149 /**
2150 * Release existing cached pattern
2151 */
releasePattern()2152 void UnicodeSet::releasePattern() {
2153 if (pat) {
2154 uprv_free(pat);
2155 pat = NULL;
2156 patLen = 0;
2157 }
2158 }
2159
2160 /**
2161 * Set the new pattern to cache.
2162 */
setPattern(const char16_t * newPat,int32_t newPatLen)2163 void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) {
2164 releasePattern();
2165 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar));
2166 if (pat) {
2167 patLen = newPatLen;
2168 u_memcpy(pat, newPat, patLen);
2169 pat[patLen] = 0;
2170 }
2171 // else we don't care if malloc failed. This was just a nice cache.
2172 // We can regenerate an equivalent pattern later when requested.
2173 }
2174
freeze()2175 UnicodeSet *UnicodeSet::freeze() {
2176 if(!isFrozen() && !isBogus()) {
2177 compact();
2178
2179 // Optimize contains() and span() and similar functions.
2180 if (hasStrings()) {
2181 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL);
2182 if (stringSpan == nullptr) {
2183 setToBogus();
2184 return this;
2185 } else if (!stringSpan->needsStringSpanUTF16()) {
2186 // All strings are irrelevant for span() etc. because
2187 // all of each string's code points are contained in this set.
2188 // Do not check needsStringSpanUTF8() because UTF-8 has at most as
2189 // many relevant strings as UTF-16.
2190 // (Thus needsStringSpanUTF8() implies needsStringSpanUTF16().)
2191 delete stringSpan;
2192 stringSpan = NULL;
2193 }
2194 }
2195 if (stringSpan == NULL) {
2196 // No span-relevant strings: Optimize for code point spans.
2197 bmpSet=new BMPSet(list, len);
2198 if (bmpSet == NULL) { // Check for memory allocation error.
2199 setToBogus();
2200 }
2201 }
2202 }
2203 return this;
2204 }
2205
span(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2206 int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2207 if(length>0 && bmpSet!=NULL) {
2208 return (int32_t)(bmpSet->span(s, s+length, spanCondition)-s);
2209 }
2210 if(length<0) {
2211 length=u_strlen(s);
2212 }
2213 if(length==0) {
2214 return 0;
2215 }
2216 if(stringSpan!=NULL) {
2217 return stringSpan->span(s, length, spanCondition);
2218 } else if(hasStrings()) {
2219 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2220 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED :
2221 UnicodeSetStringSpan::FWD_UTF16_CONTAINED;
2222 UnicodeSetStringSpan strSpan(*this, *strings, which);
2223 if(strSpan.needsStringSpanUTF16()) {
2224 return strSpan.span(s, length, spanCondition);
2225 }
2226 }
2227
2228 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2229 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2230 }
2231
2232 UChar32 c;
2233 int32_t start=0, prev=0;
2234 do {
2235 U16_NEXT(s, start, length, c);
2236 if(spanCondition!=contains(c)) {
2237 break;
2238 }
2239 } while((prev=start)<length);
2240 return prev;
2241 }
2242
spanBack(const UChar * s,int32_t length,USetSpanCondition spanCondition) const2243 int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const {
2244 if(length>0 && bmpSet!=NULL) {
2245 return (int32_t)(bmpSet->spanBack(s, s+length, spanCondition)-s);
2246 }
2247 if(length<0) {
2248 length=u_strlen(s);
2249 }
2250 if(length==0) {
2251 return 0;
2252 }
2253 if(stringSpan!=NULL) {
2254 return stringSpan->spanBack(s, length, spanCondition);
2255 } else if(hasStrings()) {
2256 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2257 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED :
2258 UnicodeSetStringSpan::BACK_UTF16_CONTAINED;
2259 UnicodeSetStringSpan strSpan(*this, *strings, which);
2260 if(strSpan.needsStringSpanUTF16()) {
2261 return strSpan.spanBack(s, length, spanCondition);
2262 }
2263 }
2264
2265 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2266 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2267 }
2268
2269 UChar32 c;
2270 int32_t prev=length;
2271 do {
2272 U16_PREV(s, 0, length, c);
2273 if(spanCondition!=contains(c)) {
2274 break;
2275 }
2276 } while((prev=length)>0);
2277 return prev;
2278 }
2279
spanUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2280 int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2281 if(length>0 && bmpSet!=NULL) {
2282 const uint8_t *s0=(const uint8_t *)s;
2283 return (int32_t)(bmpSet->spanUTF8(s0, length, spanCondition)-s0);
2284 }
2285 if(length<0) {
2286 length=(int32_t)uprv_strlen(s);
2287 }
2288 if(length==0) {
2289 return 0;
2290 }
2291 if(stringSpan!=NULL) {
2292 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition);
2293 } else if(hasStrings()) {
2294 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2295 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED :
2296 UnicodeSetStringSpan::FWD_UTF8_CONTAINED;
2297 UnicodeSetStringSpan strSpan(*this, *strings, which);
2298 if(strSpan.needsStringSpanUTF8()) {
2299 return strSpan.spanUTF8((const uint8_t *)s, length, spanCondition);
2300 }
2301 }
2302
2303 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2304 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2305 }
2306
2307 UChar32 c;
2308 int32_t start=0, prev=0;
2309 do {
2310 U8_NEXT_OR_FFFD(s, start, length, c);
2311 if(spanCondition!=contains(c)) {
2312 break;
2313 }
2314 } while((prev=start)<length);
2315 return prev;
2316 }
2317
spanBackUTF8(const char * s,int32_t length,USetSpanCondition spanCondition) const2318 int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const {
2319 if(length>0 && bmpSet!=NULL) {
2320 const uint8_t *s0=(const uint8_t *)s;
2321 return bmpSet->spanBackUTF8(s0, length, spanCondition);
2322 }
2323 if(length<0) {
2324 length=(int32_t)uprv_strlen(s);
2325 }
2326 if(length==0) {
2327 return 0;
2328 }
2329 if(stringSpan!=NULL) {
2330 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition);
2331 } else if(hasStrings()) {
2332 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ?
2333 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED :
2334 UnicodeSetStringSpan::BACK_UTF8_CONTAINED;
2335 UnicodeSetStringSpan strSpan(*this, *strings, which);
2336 if(strSpan.needsStringSpanUTF8()) {
2337 return strSpan.spanBackUTF8((const uint8_t *)s, length, spanCondition);
2338 }
2339 }
2340
2341 if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
2342 spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values.
2343 }
2344
2345 UChar32 c;
2346 int32_t prev=length;
2347 do {
2348 U8_PREV_OR_FFFD(s, 0, length, c);
2349 if(spanCondition!=contains(c)) {
2350 break;
2351 }
2352 } while((prev=length)>0);
2353 return prev;
2354 }
2355
2356 U_NAMESPACE_END
2357