1diff --git a/source/common/characterproperties.cpp b/source/common/characterproperties.cpp 2index 3aff85b3..b416ef52 100644 3--- a/source/common/characterproperties.cpp 4+++ b/source/common/characterproperties.cpp 5@@ -23,6 +23,9 @@ 6 #include "umutex.h" 7 #include "uprops.h" 8 9+using icu::LocalPointer; 10+using icu::Normalizer2Factory; 11+using icu::Normalizer2Impl; 12 using icu::UInitOnce; 13 using icu::UnicodeSet; 14 15@@ -30,11 +33,13 @@ namespace { 16 17 UBool U_CALLCONV characterproperties_cleanup(); 18 19+constexpr int32_t NUM_INCLUSIONS = UPROPS_SRC_COUNT + UCHAR_INT_LIMIT - UCHAR_INT_START; 20+ 21 struct Inclusion { 22 UnicodeSet *fSet; 23 UInitOnce fInitOnce; 24 }; 25-Inclusion gInclusions[UPROPS_SRC_COUNT]; // cached getInclusions() 26+Inclusion gInclusions[NUM_INCLUSIONS]; // cached getInclusions() 27 28 UnicodeSet *sets[UCHAR_BINARY_LIMIT] = {}; 29 30@@ -80,35 +85,22 @@ UBool U_CALLCONV characterproperties_cleanup() { 31 return TRUE; 32 } 33 34-} // namespace 35- 36-U_NAMESPACE_BEGIN 37- 38-/* 39-Reduce excessive reallocation, and make it easier to detect initialization problems. 40-Usually you don't see smaller sets than this for Unicode 5.0. 41-*/ 42-constexpr int32_t DEFAULT_INCLUSION_CAPACITY = 3072; 43- 44-void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCode &errorCode) { 45+void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode) { 46 // This function is invoked only via umtx_initOnce(). 47- // This function is a friend of class UnicodeSet. 48- 49 U_ASSERT(0 <= src && src < UPROPS_SRC_COUNT); 50 if (src == UPROPS_SRC_NONE) { 51 errorCode = U_INTERNAL_PROGRAM_ERROR; 52 return; 53 } 54- UnicodeSet * &incl = gInclusions[src].fSet; 55- U_ASSERT(incl == nullptr); 56+ U_ASSERT(gInclusions[src].fSet == nullptr); 57 58- incl = new UnicodeSet(); 59- if (incl == nullptr) { 60+ LocalPointer<UnicodeSet> incl(new UnicodeSet()); 61+ if (incl.isNull()) { 62 errorCode = U_MEMORY_ALLOCATION_ERROR; 63 return; 64 } 65 USetAdder sa = { 66- (USet *)incl, 67+ (USet *)incl.getAlias(), 68 _set_add, 69 _set_addRange, 70 _set_addString, 71@@ -116,7 +108,6 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo 72 nullptr // don't need removeRange() 73 }; 74 75- incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, errorCode); 76 switch(src) { 77 case UPROPS_SRC_CHAR: 78 uchar_addPropertyStarts(&sa, &errorCode); 79@@ -183,12 +174,15 @@ void U_CALLCONV CharacterProperties::initInclusion(UPropertySource src, UErrorCo 80 } 81 82 if (U_FAILURE(errorCode)) { 83- delete incl; 84- incl = nullptr; 85 return; 86 } 87- // Compact for caching 88+ if (incl->isBogus()) { 89+ errorCode = U_MEMORY_ALLOCATION_ERROR; 90+ return; 91+ } 92+ // Compact for caching. 93 incl->compact(); 94+ gInclusions[src].fSet = incl.orphan(); 95 ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); 96 } 97 98@@ -199,15 +193,66 @@ const UnicodeSet *getInclusionsForSource(UPropertySource src, UErrorCode &errorC 99 return nullptr; 100 } 101 Inclusion &i = gInclusions[src]; 102- umtx_initOnce(i.fInitOnce, &CharacterProperties::initInclusion, src, errorCode); 103+ umtx_initOnce(i.fInitOnce, &initInclusion, src, errorCode); 104 return i.fSet; 105 } 106 107+void U_CALLCONV initIntPropInclusion(UProperty prop, UErrorCode &errorCode) { 108+ // This function is invoked only via umtx_initOnce(). 109+ U_ASSERT(UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT); 110+ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; 111+ U_ASSERT(gInclusions[inclIndex].fSet == nullptr); 112+ UPropertySource src = uprops_getSource(prop); 113+ const UnicodeSet *incl = getInclusionsForSource(src, errorCode); 114+ if (U_FAILURE(errorCode)) { 115+ return; 116+ } 117+ 118+ LocalPointer<UnicodeSet> intPropIncl(new UnicodeSet(0, 0)); 119+ if (intPropIncl.isNull()) { 120+ errorCode = U_MEMORY_ALLOCATION_ERROR; 121+ return; 122+ } 123+ int32_t numRanges = incl->getRangeCount(); 124+ int32_t prevValue = 0; 125+ for (int32_t i = 0; i < numRanges; ++i) { 126+ UChar32 rangeEnd = incl->getRangeEnd(i); 127+ for (UChar32 c = incl->getRangeStart(i); c <= rangeEnd; ++c) { 128+ // TODO: Get a UCharacterProperty.IntProperty to avoid the property dispatch. 129+ int32_t value = u_getIntPropertyValue(c, prop); 130+ if (value != prevValue) { 131+ intPropIncl->add(c); 132+ prevValue = value; 133+ } 134+ } 135+ } 136+ 137+ if (intPropIncl->isBogus()) { 138+ errorCode = U_MEMORY_ALLOCATION_ERROR; 139+ return; 140+ } 141+ // Compact for caching. 142+ intPropIncl->compact(); 143+ gInclusions[inclIndex].fSet = intPropIncl.orphan(); 144+ ucln_common_registerCleanup(UCLN_COMMON_CHARACTERPROPERTIES, characterproperties_cleanup); 145+} 146+ 147+} // namespace 148+ 149+U_NAMESPACE_BEGIN 150+ 151 const UnicodeSet *CharacterProperties::getInclusionsForProperty( 152 UProperty prop, UErrorCode &errorCode) { 153 if (U_FAILURE(errorCode)) { return nullptr; } 154- UPropertySource src = uprops_getSource(prop); 155- return getInclusionsForSource(src, errorCode); 156+ if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { 157+ int32_t inclIndex = UPROPS_SRC_COUNT + prop - UCHAR_INT_START; 158+ Inclusion &i = gInclusions[inclIndex]; 159+ umtx_initOnce(i.fInitOnce, &initIntPropInclusion, prop, errorCode); 160+ return i.fSet; 161+ } else { 162+ UPropertySource src = uprops_getSource(prop); 163+ return getInclusionsForSource(src, errorCode); 164+ } 165 } 166 167 U_NAMESPACE_END 168@@ -216,7 +261,7 @@ namespace { 169 170 UnicodeSet *makeSet(UProperty property, UErrorCode &errorCode) { 171 if (U_FAILURE(errorCode)) { return nullptr; } 172- icu::LocalPointer<UnicodeSet> set(new UnicodeSet()); 173+ LocalPointer<UnicodeSet> set(new UnicodeSet()); 174 if (set.isNull()) { 175 errorCode = U_MEMORY_ALLOCATION_ERROR; 176 return nullptr; 177diff --git a/source/common/ucptrie.cpp b/source/common/ucptrie.cpp 178index 13496ad5..b72e3183 100644 179--- a/source/common/ucptrie.cpp 180+++ b/source/common/ucptrie.cpp 181@@ -280,7 +280,7 @@ UChar32 getRange(const void *t, UChar32 start, 182 int32_t prevI3Block = -1; 183 int32_t prevBlock = -1; 184 UChar32 c = start; 185- uint32_t value; 186+ uint32_t trieValue, value; 187 bool haveValue = false; 188 do { 189 int32_t i3Block; 190@@ -319,6 +319,7 @@ UChar32 getRange(const void *t, UChar32 start, 191 return c - 1; 192 } 193 } else { 194+ trieValue = trie->nullValue; 195 value = nullValue; 196 if (pValue != nullptr) { *pValue = nullValue; } 197 haveValue = true; 198@@ -357,6 +358,7 @@ UChar32 getRange(const void *t, UChar32 start, 199 return c - 1; 200 } 201 } else { 202+ trieValue = trie->nullValue; 203 value = nullValue; 204 if (pValue != nullptr) { *pValue = nullValue; } 205 haveValue = true; 206@@ -364,23 +366,32 @@ UChar32 getRange(const void *t, UChar32 start, 207 c = (c + dataBlockLength) & ~dataMask; 208 } else { 209 int32_t di = block + (c & dataMask); 210- uint32_t value2 = getValue(trie->data, valueWidth, di); 211- value2 = maybeFilterValue(value2, trie->nullValue, nullValue, 212- filter, context); 213+ uint32_t trieValue2 = getValue(trie->data, valueWidth, di); 214 if (haveValue) { 215- if (value2 != value) { 216- return c - 1; 217+ if (trieValue2 != trieValue) { 218+ if (filter == nullptr || 219+ maybeFilterValue(trieValue2, trie->nullValue, nullValue, 220+ filter, context) != value) { 221+ return c - 1; 222+ } 223+ trieValue = trieValue2; // may or may not help 224 } 225 } else { 226- value = value2; 227+ trieValue = trieValue2; 228+ value = maybeFilterValue(trieValue2, trie->nullValue, nullValue, 229+ filter, context); 230 if (pValue != nullptr) { *pValue = value; } 231 haveValue = true; 232 } 233 while ((++c & dataMask) != 0) { 234- if (maybeFilterValue(getValue(trie->data, valueWidth, ++di), 235- trie->nullValue, nullValue, 236- filter, context) != value) { 237- return c - 1; 238+ trieValue2 = getValue(trie->data, valueWidth, ++di); 239+ if (trieValue2 != trieValue) { 240+ if (filter == nullptr || 241+ maybeFilterValue(trieValue2, trie->nullValue, nullValue, 242+ filter, context) != value) { 243+ return c - 1; 244+ } 245+ trieValue = trieValue2; // may or may not help 246 } 247 } 248 } 249diff --git a/source/common/umutablecptrie.cpp b/source/common/umutablecptrie.cpp 250index 44af8309..926be468 100644 251--- a/source/common/umutablecptrie.cpp 252+++ b/source/common/umutablecptrie.cpp 253@@ -304,41 +304,56 @@ UChar32 MutableCodePointTrie::getRange( 254 uint32_t nullValue = initialValue; 255 if (filter != nullptr) { nullValue = filter(context, nullValue); } 256 UChar32 c = start; 257- uint32_t value; 258+ uint32_t trieValue, value; 259 bool haveValue = false; 260 int32_t i = c >> UCPTRIE_SHIFT_3; 261 do { 262 if (flags[i] == ALL_SAME) { 263- uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue, 264- filter, context); 265+ uint32_t trieValue2 = index[i]; 266 if (haveValue) { 267- if (value2 != value) { 268- return c - 1; 269+ if (trieValue2 != trieValue) { 270+ if (filter == nullptr || 271+ maybeFilterValue(trieValue2, initialValue, nullValue, 272+ filter, context) != value) { 273+ return c - 1; 274+ } 275+ trieValue = trieValue2; // may or may not help 276 } 277 } else { 278- value = value2; 279+ trieValue = trieValue2; 280+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); 281 if (pValue != nullptr) { *pValue = value; } 282 haveValue = true; 283 } 284 c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK; 285 } else /* MIXED */ { 286 int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK); 287- uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue, 288- filter, context); 289+ uint32_t trieValue2 = data[di]; 290 if (haveValue) { 291- if (value2 != value) { 292- return c - 1; 293+ if (trieValue2 != trieValue) { 294+ if (filter == nullptr || 295+ maybeFilterValue(trieValue2, initialValue, nullValue, 296+ filter, context) != value) { 297+ return c - 1; 298+ } 299+ trieValue = trieValue2; // may or may not help 300 } 301 } else { 302- value = value2; 303+ trieValue = trieValue2; 304+ value = maybeFilterValue(trieValue2, initialValue, nullValue, filter, context); 305 if (pValue != nullptr) { *pValue = value; } 306 haveValue = true; 307 } 308 while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) { 309- if (maybeFilterValue(data[++di], initialValue, nullValue, 310- filter, context) != value) { 311- return c - 1; 312+ trieValue2 = data[++di]; 313+ if (trieValue2 != trieValue) { 314+ if (filter == nullptr || 315+ maybeFilterValue(trieValue2, initialValue, nullValue, 316+ filter, context) != value) { 317+ return c - 1; 318+ } 319 } 320+ trieValue = trieValue2; // may or may not help 321 } 322 } 323 ++i; 324diff --git a/source/common/unicode/uniset.h b/source/common/unicode/uniset.h 325index 0abc7542..af56b872 100644 326--- a/source/common/unicode/uniset.h 327+++ b/source/common/unicode/uniset.h 328@@ -27,7 +27,6 @@ U_NAMESPACE_BEGIN 329 330 // Forward Declarations. 331 class BMPSet; 332-class CharacterProperties; 333 class ParsePosition; 334 class RBBIRuleScanner; 335 class SymbolTable; 336@@ -276,14 +275,23 @@ class RuleCharacterIterator; 337 * @stable ICU 2.0 338 */ 339 class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { 340+private: 341+ /** 342+ * Enough for sets with few ranges. 343+ * For example, White_Space has 10 ranges, list length 21. 344+ */ 345+ static constexpr int32_t INITIAL_CAPACITY = 25; 346+ // fFlags constant 347+ static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid) 348+ 349+ UChar32* list = stackList; // MUST be terminated with HIGH 350+ int32_t capacity = INITIAL_CAPACITY; // capacity of list 351+ int32_t len = 1; // length of list used; 1 <= len <= capacity 352+ uint8_t fFlags = 0; // Bit flag (see constants above) 353 354- int32_t len; // length of list used; 0 <= len <= capacity 355- int32_t capacity; // capacity of list 356- UChar32* list; // MUST be terminated with HIGH 357- BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. 358- UChar32* buffer; // internal buffer, may be NULL 359- int32_t bufferCapacity; // capacity of buffer 360- int32_t patLen; 361+ BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL. 362+ UChar32* buffer = nullptr; // internal buffer, may be NULL 363+ int32_t bufferCapacity = 0; // capacity of buffer 364 365 /** 366 * The pattern representation of this set. This may not be the 367@@ -294,15 +302,19 @@ class U_COMMON_API UnicodeSet U_FINAL : public UnicodeFilter { 368 * indicating that toPattern() must generate a pattern 369 * representation from the inversion list. 370 */ 371- char16_t *pat; 372- UVector* strings; // maintained in sorted order 373- UnicodeSetStringSpan *stringSpan; 374+ char16_t *pat = nullptr; 375+ int32_t patLen = 0; 376+ 377+ UVector* strings = nullptr; // maintained in sorted order 378+ UnicodeSetStringSpan *stringSpan = nullptr; 379+ 380+ /** 381+ * Initial list array. 382+ * Avoids some heap allocations, and list is never nullptr. 383+ * Increases the object size a bit. 384+ */ 385+ UChar32 stackList[INITIAL_CAPACITY]; 386 387-private: 388- enum { // constants 389- kIsBogus = 1 // This set is bogus (i.e. not valid) 390- }; 391- uint8_t fFlags; // Bit flag (see constants above) 392 public: 393 /** 394 * Determine if this object contains a valid set. 395@@ -1480,8 +1492,6 @@ private: 396 397 friend class USetAccess; 398 399- int32_t getStringCount() const; 400- 401 const UnicodeString* getString(int32_t index) const; 402 403 //---------------------------------------------------------------- 404@@ -1528,13 +1538,18 @@ private: 405 // Implementation: Utility methods 406 //---------------------------------------------------------------- 407 408- void ensureCapacity(int32_t newLen, UErrorCode& ec); 409+ static int32_t nextCapacity(int32_t minCapacity); 410+ 411+ bool ensureCapacity(int32_t newLen); 412 413- void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); 414+ bool ensureBufferCapacity(int32_t newLen); 415 416 void swapBuffers(void); 417 418 UBool allocateStrings(UErrorCode &status); 419+ UBool hasStrings() const; 420+ int32_t stringsSize() const; 421+ UBool stringsContains(const UnicodeString &s) const; 422 423 UnicodeString& _toPattern(UnicodeString& result, 424 UBool escapeUnprintable) const; 425@@ -1614,7 +1629,6 @@ private: 426 UnicodeString& rebuiltPat, 427 UErrorCode& ec); 428 429- friend class CharacterProperties; 430 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); 431 432 /** 433@@ -1646,7 +1660,10 @@ private: 434 /** 435 * Set the new pattern to cache. 436 */ 437- void setPattern(const UnicodeString& newPat); 438+ void setPattern(const UnicodeString& newPat) { 439+ setPattern(newPat.getBuffer(), newPat.length()); 440+ } 441+ void setPattern(const char16_t *newPat, int32_t newPatLen); 442 /** 443 * Release existing cached pattern. 444 */ 445diff --git a/source/common/uniset.cpp b/source/common/uniset.cpp 446index e8378e0a..20242776 100644 447--- a/source/common/uniset.cpp 448+++ b/source/common/uniset.cpp 449@@ -14,6 +14,7 @@ 450 #include "unicode/parsepos.h" 451 #include "unicode/symtable.h" 452 #include "unicode/uniset.h" 453+#include "unicode/ustring.h" 454 #include "unicode/utf8.h" 455 #include "unicode/utf16.h" 456 #include "ruleiter.h" 457@@ -53,11 +54,8 @@ 458 // LOW <= all valid values. ZERO for codepoints 459 #define UNICODESET_LOW 0x000000 460 461-// initial storage. Must be >= 0 462-#define START_EXTRA 16 463- 464-// extra amount for growth. Must be >= 0 465-#define GROW_EXTRA START_EXTRA 466+/** Max list [0, 1, 2, ..., max code point, HIGH] */ 467+constexpr int32_t MAX_LENGTH = UNICODESET_HIGH + 1; 468 469 U_NAMESPACE_BEGIN 470 471@@ -137,6 +135,18 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 472 return a.compare(b); 473 } 474 475+UBool UnicodeSet::hasStrings() const { 476+ return strings != nullptr && !strings->isEmpty(); 477+} 478+ 479+int32_t UnicodeSet::stringsSize() const { 480+ return strings == nullptr ? 0 : strings->size(); 481+} 482+ 483+UBool UnicodeSet::stringsContains(const UnicodeString &s) const { 484+ return strings != nullptr && strings->contains((void*) &s); 485+} 486+ 487 //---------------------------------------------------------------- 488 // Constructors &c 489 //---------------------------------------------------------------- 490@@ -144,24 +154,8 @@ static int8_t U_CALLCONV compareUnicodeString(UElement t1, UElement t2) { 491 /** 492 * Constructs an empty set. 493 */ 494-UnicodeSet::UnicodeSet() : 495- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 496- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 497- fFlags(0) 498-{ 499- UErrorCode status = U_ZERO_ERROR; 500- allocateStrings(status); 501- if (U_FAILURE(status)) { 502- setToBogus(); // If memory allocation failed, set to bogus state. 503- return; 504- } 505- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 506- if(list!=NULL){ 507- list[0] = UNICODESET_HIGH; 508- } else { // If memory allocation failed, set to bogus state. 509- setToBogus(); 510- return; 511- } 512+UnicodeSet::UnicodeSet() { 513+ list[0] = UNICODESET_HIGH; 514 _dbgct(this); 515 } 516 517@@ -172,89 +166,39 @@ UnicodeSet::UnicodeSet() : 518 * @param start first character, inclusive, of range 519 * @param end last character, inclusive, of range 520 */ 521-UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) : 522- len(1), capacity(1 + START_EXTRA), list(0), bmpSet(0), buffer(0), 523- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 524- fFlags(0) 525-{ 526- UErrorCode status = U_ZERO_ERROR; 527- allocateStrings(status); 528- if (U_FAILURE(status)) { 529- setToBogus(); // If memory allocation failed, set to bogus state. 530- return; 531- } 532- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 533- if(list!=NULL){ 534- list[0] = UNICODESET_HIGH; 535- complement(start, end); 536- } else { // If memory allocation failed, set to bogus state. 537- setToBogus(); 538- return; 539- } 540+UnicodeSet::UnicodeSet(UChar32 start, UChar32 end) { 541+ list[0] = UNICODESET_HIGH; 542+ add(start, end); 543 _dbgct(this); 544 } 545 546 /** 547 * Constructs a set that is identical to the given UnicodeSet. 548 */ 549-UnicodeSet::UnicodeSet(const UnicodeSet& o) : 550- UnicodeFilter(o), 551- len(0), capacity(o.isFrozen() ? o.len : o.len + GROW_EXTRA), list(0), 552- bmpSet(0), 553- buffer(0), bufferCapacity(0), 554- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 555- fFlags(0) 556-{ 557- UErrorCode status = U_ZERO_ERROR; 558- allocateStrings(status); 559- if (U_FAILURE(status)) { 560- setToBogus(); // If memory allocation failed, set to bogus state. 561- return; 562- } 563- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 564- if(list!=NULL){ 565- *this = o; 566- } else { // If memory allocation failed, set to bogus state. 567- setToBogus(); 568- return; 569- } 570+UnicodeSet::UnicodeSet(const UnicodeSet& o) : UnicodeFilter(o) { 571+ *this = o; 572 _dbgct(this); 573 } 574 575 // Copy-construct as thawed. 576-UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 577- UnicodeFilter(o), 578- len(0), capacity(o.len + GROW_EXTRA), list(0), 579- bmpSet(0), 580- buffer(0), bufferCapacity(0), 581- patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 582- fFlags(0) 583-{ 584- UErrorCode status = U_ZERO_ERROR; 585- allocateStrings(status); 586- if (U_FAILURE(status)) { 587- setToBogus(); // If memory allocation failed, set to bogus state. 588- return; 589- } 590- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 591- if(list!=NULL){ 592+UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : UnicodeFilter(o) { 593+ if (ensureCapacity(o.len)) { 594 // *this = o except for bmpSet and stringSpan 595 len = o.len; 596 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); 597- if (strings != NULL && o.strings != NULL) { 598- strings->assign(*o.strings, cloneUnicodeString, status); 599- } else { // Invalid strings. 600- setToBogus(); 601- return; 602+ if (o.hasStrings()) { 603+ UErrorCode status = U_ZERO_ERROR; 604+ if (!allocateStrings(status) || 605+ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { 606+ setToBogus(); 607+ return; 608+ } 609 } 610 if (o.pat) { 611- setPattern(UnicodeString(o.pat, o.patLen)); 612+ setPattern(o.pat, o.patLen); 613 } 614- } else { // If memory allocation failed, set to bogus state. 615- setToBogus(); 616- return; 617+ _dbgct(this); 618 } 619- _dbgct(this); 620 } 621 622 /** 623@@ -262,9 +206,11 @@ UnicodeSet::UnicodeSet(const UnicodeSet& o, UBool /* asThawed */) : 624 */ 625 UnicodeSet::~UnicodeSet() { 626 _dbgdt(this); // first! 627- uprv_free(list); 628+ if (list != stackList) { 629+ uprv_free(list); 630+ } 631 delete bmpSet; 632- if (buffer) { 633+ if (buffer != stackList) { 634 uprv_free(buffer); 635 } 636 delete strings; 637@@ -290,32 +236,30 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { 638 setToBogus(); 639 return *this; 640 } 641- UErrorCode ec = U_ZERO_ERROR; 642- ensureCapacity(o.len, ec); 643- if (U_FAILURE(ec)) { 644+ if (!ensureCapacity(o.len)) { 645 // ensureCapacity will mark the UnicodeSet as Bogus if OOM failure happens. 646 return *this; 647 } 648 len = o.len; 649 uprv_memcpy(list, o.list, (size_t)len*sizeof(UChar32)); 650- if (o.bmpSet == NULL || asThawed) { 651- bmpSet = NULL; 652- } else { 653+ if (o.bmpSet != nullptr && !asThawed) { 654 bmpSet = new BMPSet(*o.bmpSet, list, len); 655 if (bmpSet == NULL) { // Check for memory allocation error. 656 setToBogus(); 657 return *this; 658 } 659 } 660- if (strings != NULL && o.strings != NULL) { 661- strings->assign(*o.strings, cloneUnicodeString, ec); 662- } else { // Invalid strings. 663- setToBogus(); 664- return *this; 665+ if (o.hasStrings()) { 666+ UErrorCode status = U_ZERO_ERROR; 667+ if ((strings == nullptr && !allocateStrings(status)) || 668+ (strings->assign(*o.strings, cloneUnicodeString, status), U_FAILURE(status))) { 669+ setToBogus(); 670+ return *this; 671+ } 672+ } else if (hasStrings()) { 673+ strings->removeAllElements(); 674 } 675- if (o.stringSpan == NULL || asThawed) { 676- stringSpan = NULL; 677- } else { 678+ if (o.stringSpan != nullptr && !asThawed) { 679 stringSpan = new UnicodeSetStringSpan(*o.stringSpan, *strings); 680 if (stringSpan == NULL) { // Check for memory allocation error. 681 setToBogus(); 682@@ -324,7 +268,7 @@ UnicodeSet& UnicodeSet::copyFrom(const UnicodeSet& o, UBool asThawed) { 683 } 684 releasePattern(); 685 if (o.pat) { 686- setPattern(UnicodeString(o.pat, o.patLen)); 687+ setPattern(o.pat, o.patLen); 688 } 689 return *this; 690 } 691@@ -357,7 +301,8 @@ UBool UnicodeSet::operator==(const UnicodeSet& o) const { 692 for (int32_t i = 0; i < len; ++i) { 693 if (list[i] != o.list[i]) return FALSE; 694 } 695- if (*strings != *o.strings) return FALSE; 696+ if (hasStrings() != o.hasStrings()) { return FALSE; } 697+ if (hasStrings() && *strings != *o.strings) return FALSE; 698 return TRUE; 699 } 700 701@@ -393,7 +338,7 @@ int32_t UnicodeSet::size(void) const { 702 for (int32_t i = 0; i < count; ++i) { 703 n += getRangeEnd(i) - getRangeStart(i) + 1; 704 } 705- return n + strings->size(); 706+ return n + stringsSize(); 707 } 708 709 /** 710@@ -402,7 +347,7 @@ int32_t UnicodeSet::size(void) const { 711 * @return <tt>true</tt> if this set contains no elements. 712 */ 713 UBool UnicodeSet::isEmpty(void) const { 714- return len == 1 && strings->size() == 0; 715+ return len == 1 && !hasStrings(); 716 } 717 718 /** 719@@ -502,7 +447,7 @@ UBool UnicodeSet::contains(const UnicodeString& s) const { 720 if (s.length() == 0) return FALSE; 721 int32_t cp = getSingleCP(s); 722 if (cp < 0) { 723- return strings->contains((void*) &s); 724+ return stringsContains(s); 725 } else { 726 return contains((UChar32) cp); 727 } 728@@ -524,8 +469,7 @@ UBool UnicodeSet::containsAll(const UnicodeSet& c) const { 729 return FALSE; 730 } 731 } 732- if (!strings->containsAll(*c.strings)) return FALSE; 733- return TRUE; 734+ return !c.hasStrings() || (strings != nullptr && strings->containsAll(*c.strings)); 735 } 736 737 /** 738@@ -571,8 +515,7 @@ UBool UnicodeSet::containsNone(const UnicodeSet& c) const { 739 return FALSE; 740 } 741 } 742- if (!strings->containsNone(*c.strings)) return FALSE; 743- return TRUE; 744+ return strings == nullptr || !c.hasStrings() || strings->containsNone(*c.strings); 745 } 746 747 /** 748@@ -613,7 +556,7 @@ UBool UnicodeSet::matchesIndexValue(uint8_t v) const { 749 return TRUE; 750 } 751 } 752- if (strings->size() != 0) { 753+ if (hasStrings()) { 754 for (i=0; i<strings->size(); ++i) { 755 const UnicodeString& s = *(const UnicodeString*)strings->elementAt(i); 756 //if (s.length() == 0) { 757@@ -648,7 +591,7 @@ UMatchDegree UnicodeSet::matches(const Replaceable& text, 758 return U_MISMATCH; 759 } 760 } else { 761- if (strings->size() != 0) { // try strings first 762+ if (hasStrings()) { // try strings first 763 764 // might separate forward and backward loops later 765 // for now they are combined 766@@ -849,7 +792,39 @@ UnicodeSet& UnicodeSet::set(UChar32 start, UChar32 end) { 767 */ 768 UnicodeSet& UnicodeSet::add(UChar32 start, UChar32 end) { 769 if (pinCodePoint(start) < pinCodePoint(end)) { 770- UChar32 range[3] = { start, end+1, UNICODESET_HIGH }; 771+ UChar32 limit = end + 1; 772+ // Fast path for adding a new range after the last one. 773+ // Odd list length: [..., lastStart, lastLimit, HIGH] 774+ if ((len & 1) != 0) { 775+ // If the list is empty, set lastLimit low enough to not be adjacent to 0. 776+ UChar32 lastLimit = len == 1 ? -2 : list[len - 2]; 777+ if (lastLimit <= start && !isFrozen() && !isBogus()) { 778+ if (lastLimit == start) { 779+ // Extend the last range. 780+ list[len - 2] = limit; 781+ if (limit == UNICODESET_HIGH) { 782+ --len; 783+ } 784+ } else { 785+ list[len - 1] = start; 786+ if (limit < UNICODESET_HIGH) { 787+ if (ensureCapacity(len + 2)) { 788+ list[len++] = limit; 789+ list[len++] = UNICODESET_HIGH; 790+ } 791+ } else { // limit == UNICODESET_HIGH 792+ if (ensureCapacity(len + 1)) { 793+ list[len++] = UNICODESET_HIGH; 794+ } 795+ } 796+ } 797+ releasePattern(); 798+ return *this; 799+ } 800+ } 801+ // This is slow. Could be much faster using findCodePoint(start) 802+ // and modifying the list, dealing with adjacent & overlapping ranges. 803+ UChar32 range[3] = { start, limit, UNICODESET_HIGH }; 804 add(range, 2, 0); 805 } else if (start == end) { 806 add(start); 807@@ -918,9 +893,7 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { 808 list[i] = c; 809 // if we touched the HIGH mark, then add a new one 810 if (c == (UNICODESET_HIGH - 1)) { 811- UErrorCode status = U_ZERO_ERROR; 812- ensureCapacity(len+1, status); 813- if (U_FAILURE(status)) { 814+ if (!ensureCapacity(len+1)) { 815 // ensureCapacity will mark the object as Bogus if OOM failure happens. 816 return *this; 817 } 818@@ -964,21 +937,13 @@ UnicodeSet& UnicodeSet::add(UChar32 c) { 819 // ^ 820 // list[i] 821 822- UErrorCode status = U_ZERO_ERROR; 823- ensureCapacity(len+2, status); 824- if (U_FAILURE(status)) { 825+ if (!ensureCapacity(len+2)) { 826 // ensureCapacity will mark the object as Bogus if OOM failure happens. 827 return *this; 828 } 829 830- //for (int32_t k=len-1; k>=i; --k) { 831- // list[k+2] = list[k]; 832- //} 833- UChar32* src = list + len; 834- UChar32* dst = src + 2; 835- UChar32* srclimit = list + i; 836- while (src > srclimit) *(--dst) = *(--src); 837- 838+ UChar32 *p = list + i; 839+ uprv_memmove(p + 2, p, (len - i) * sizeof(*p)); 840 list[i] = c; 841 list[i+1] = c+1; 842 len += 2; 843@@ -1014,7 +979,7 @@ UnicodeSet& UnicodeSet::add(const UnicodeString& s) { 844 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 845 int32_t cp = getSingleCP(s); 846 if (cp < 0) { 847- if (!strings->contains((void*) &s)) { 848+ if (!stringsContains(s)) { 849 _add(s); 850 releasePattern(); 851 } 852@@ -1033,12 +998,16 @@ void UnicodeSet::_add(const UnicodeString& s) { 853 if (isFrozen() || isBogus()) { 854 return; 855 } 856+ UErrorCode ec = U_ZERO_ERROR; 857+ if (strings == nullptr && !allocateStrings(ec)) { 858+ setToBogus(); 859+ return; 860+ } 861 UnicodeString* t = new UnicodeString(s); 862 if (t == NULL) { // Check for memory allocation error. 863 setToBogus(); 864 return; 865 } 866- UErrorCode ec = U_ZERO_ERROR; 867 strings->sortedInsert(t, compareUnicodeString, ec); 868 if (U_FAILURE(ec)) { 869 setToBogus(); 870@@ -1121,7 +1090,10 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeString& s) { 871 } 872 873 UnicodeSet& UnicodeSet::removeAllStrings() { 874- strings->removeAllElements(); 875+ if (!isFrozen() && hasStrings()) { 876+ strings->removeAllElements(); 877+ releasePattern(); 878+ } 879 return *this; 880 } 881 882@@ -1217,8 +1189,9 @@ UnicodeSet& UnicodeSet::remove(const UnicodeString& s) { 883 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 884 int32_t cp = getSingleCP(s); 885 if (cp < 0) { 886- strings->removeElement((void*) &s); 887- releasePattern(); 888+ if (strings != nullptr && strings->removeElement((void*) &s)) { 889+ releasePattern(); 890+ } 891 } else { 892 remove((UChar32)cp, (UChar32)cp); 893 } 894@@ -1260,24 +1233,17 @@ UnicodeSet& UnicodeSet::complement(void) { 895 if (isFrozen() || isBogus()) { 896 return *this; 897 } 898- UErrorCode status = U_ZERO_ERROR; 899 if (list[0] == UNICODESET_LOW) { 900- ensureBufferCapacity(len-1, status); 901- if (U_FAILURE(status)) { 902- return *this; 903- } 904- uprv_memcpy(buffer, list + 1, (size_t)(len-1)*sizeof(UChar32)); 905+ uprv_memmove(list, list + 1, (size_t)(len-1)*sizeof(UChar32)); 906 --len; 907 } else { 908- ensureBufferCapacity(len+1, status); 909- if (U_FAILURE(status)) { 910+ if (!ensureCapacity(len+1)) { 911 return *this; 912 } 913- uprv_memcpy(buffer + 1, list, (size_t)len*sizeof(UChar32)); 914- buffer[0] = UNICODESET_LOW; 915+ uprv_memmove(list + 1, list, (size_t)len*sizeof(UChar32)); 916+ list[0] = UNICODESET_LOW; 917 ++len; 918 } 919- swapBuffers(); 920 releasePattern(); 921 return *this; 922 } 923@@ -1294,7 +1260,7 @@ UnicodeSet& UnicodeSet::complement(const UnicodeString& s) { 924 if (s.length() == 0 || isFrozen() || isBogus()) return *this; 925 int32_t cp = getSingleCP(s); 926 if (cp < 0) { 927- if (strings->contains((void*) &s)) { 928+ if (stringsContains(s)) { 929 strings->removeElement((void*) &s); 930 } else { 931 _add(s); 932@@ -1325,7 +1291,7 @@ UnicodeSet& UnicodeSet::addAll(const UnicodeSet& c) { 933 if ( c.strings!=NULL ) { 934 for (int32_t i=0; i<c.strings->size(); ++i) { 935 const UnicodeString* s = (const UnicodeString*)c.strings->elementAt(i); 936- if (!strings->contains((void*) s)) { 937+ if (!stringsContains(*s)) { 938 _add(*s); 939 } 940 } 941@@ -1347,7 +1313,13 @@ UnicodeSet& UnicodeSet::retainAll(const UnicodeSet& c) { 942 return *this; 943 } 944 retain(c.list, c.len, 0); 945- strings->retainAll(*c.strings); 946+ if (hasStrings()) { 947+ if (!c.hasStrings()) { 948+ strings->removeAllElements(); 949+ } else { 950+ strings->retainAll(*c.strings); 951+ } 952+ } 953 return *this; 954 } 955 956@@ -1365,7 +1337,9 @@ UnicodeSet& UnicodeSet::removeAll(const UnicodeSet& c) { 957 return *this; 958 } 959 retain(c.list, c.len, 2); 960- strings->removeAll(*c.strings); 961+ if (hasStrings() && c.hasStrings()) { 962+ strings->removeAll(*c.strings); 963+ } 964 return *this; 965 } 966 967@@ -1383,10 +1357,12 @@ UnicodeSet& UnicodeSet::complementAll(const UnicodeSet& c) { 968 } 969 exclusiveOr(c.list, c.len, 0); 970 971- for (int32_t i=0; i<c.strings->size(); ++i) { 972- void* e = c.strings->elementAt(i); 973- if (!strings->removeElement(e)) { 974- _add(*(const UnicodeString*)e); 975+ if (c.strings != nullptr) { 976+ for (int32_t i=0; i<c.strings->size(); ++i) { 977+ void* e = c.strings->elementAt(i); 978+ if (strings == nullptr || !strings->removeElement(e)) { 979+ _add(*(const UnicodeString*)e); 980+ } 981 } 982 } 983 return *this; 984@@ -1400,18 +1376,14 @@ UnicodeSet& UnicodeSet::clear(void) { 985 if (isFrozen()) { 986 return *this; 987 } 988- if (list != NULL) { 989- list[0] = UNICODESET_HIGH; 990- } 991+ list[0] = UNICODESET_HIGH; 992 len = 1; 993 releasePattern(); 994 if (strings != NULL) { 995 strings->removeAllElements(); 996 } 997- if (list != NULL && strings != NULL) { 998- // Remove bogus 999- fFlags = 0; 1000- } 1001+ // Remove bogus 1002+ fFlags = 0; 1003 return *this; 1004 } 1005 1006@@ -1445,10 +1417,6 @@ UChar32 UnicodeSet::getRangeEnd(int32_t index) const { 1007 return list[index*2 + 1] - 1; 1008 } 1009 1010-int32_t UnicodeSet::getStringCount() const { 1011- return strings->size(); 1012-} 1013- 1014 const UnicodeString* UnicodeSet::getString(int32_t index) const { 1015 return (const UnicodeString*) strings->elementAt(index); 1016 } 1017@@ -1462,22 +1430,32 @@ UnicodeSet& UnicodeSet::compact() { 1018 return *this; 1019 } 1020 // Delete buffer first to defragment memory less. 1021- if (buffer != NULL) { 1022+ if (buffer != stackList) { 1023 uprv_free(buffer); 1024 buffer = NULL; 1025- } 1026- if (len < capacity) { 1027- // Make the capacity equal to len or 1. 1028- // We don't want to realloc of 0 size. 1029- int32_t newCapacity = len + (len == 0); 1030- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * newCapacity); 1031+ bufferCapacity = 0; 1032+ } 1033+ if (list == stackList) { 1034+ // pass 1035+ } else if (len <= INITIAL_CAPACITY) { 1036+ uprv_memcpy(stackList, list, len * sizeof(UChar32)); 1037+ uprv_free(list); 1038+ list = stackList; 1039+ capacity = INITIAL_CAPACITY; 1040+ } else if ((len + 7) < capacity) { 1041+ // If we have more than a little unused capacity, shrink it to len. 1042+ UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * len); 1043 if (temp) { 1044 list = temp; 1045- capacity = newCapacity; 1046+ capacity = len; 1047 } 1048 // else what the heck happened?! We allocated less memory! 1049 // Oh well. We'll keep our original array. 1050 } 1051+ if (strings != nullptr && strings->isEmpty()) { 1052+ delete strings; 1053+ strings = nullptr; 1054+ } 1055 return *this; 1056 } 1057 1058@@ -1488,10 +1466,8 @@ UnicodeSet& UnicodeSet::compact() { 1059 /** 1060 * Deserialize constructor. 1061 */ 1062-UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, UErrorCode &ec) 1063- : len(1), capacity(1+START_EXTRA), list(0), bmpSet(0), buffer(0), 1064- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1065- fFlags(0) { 1066+UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization serialization, 1067+ UErrorCode &ec) { 1068 1069 if(U_FAILURE(ec)) { 1070 setToBogus(); 1071@@ -1506,24 +1482,15 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se 1072 return; 1073 } 1074 1075- allocateStrings(ec); 1076- if (U_FAILURE(ec)) { 1077- setToBogus(); 1078- return; 1079- } 1080- 1081 // bmp? 1082 int32_t headerSize = ((data[0]&0x8000)) ?2:1; 1083 int32_t bmpLength = (headerSize==1)?data[0]:data[1]; 1084 1085- len = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; 1086+ int32_t newLength = (((data[0]&0x7FFF)-bmpLength)/2)+bmpLength; 1087 #ifdef DEBUG_SERIALIZE 1088- printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,len, data[0],data[1],data[2],data[3]); 1089+ printf("dataLen %d headerSize %d bmpLen %d len %d. data[0]=%X/%X/%X/%X\n", dataLen,headerSize,bmpLength,newLength, data[0],data[1],data[2],data[3]); 1090 #endif 1091- capacity = len+1; 1092- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1093- if(!list || U_FAILURE(ec)) { 1094- setToBogus(); 1095+ if(!ensureCapacity(newLength + 1)) { // +1 for HIGH 1096 return; 1097 } 1098 // copy bmp 1099@@ -1535,15 +1502,18 @@ UnicodeSet::UnicodeSet(const uint16_t data[], int32_t dataLen, ESerialization se 1100 #endif 1101 } 1102 // copy smp 1103- for(i=bmpLength;i<len;i++) { 1104+ for(i=bmpLength;i<newLength;i++) { 1105 list[i] = ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+0] << 16) + 1106 ((UChar32)data[headerSize+bmpLength+(i-bmpLength)*2+1]); 1107 #ifdef DEBUG_SERIALIZE 1108 printf("<<32@%d+[%d] %lX\n", headerSize+bmpLength+i, i, list[i]); 1109 #endif 1110 } 1111- // terminator 1112- list[len++]=UNICODESET_HIGH; 1113+ U_ASSERT(i == newLength); 1114+ if (i == 0 || list[i - 1] != UNICODESET_HIGH) { 1115+ list[i++] = UNICODESET_HIGH; 1116+ } 1117+ len = i; 1118 } 1119 1120 1121@@ -1664,33 +1634,65 @@ UBool UnicodeSet::allocateStrings(UErrorCode &status) { 1122 return TRUE; 1123 } 1124 1125-void UnicodeSet::ensureCapacity(int32_t newLen, UErrorCode& ec) { 1126+int32_t UnicodeSet::nextCapacity(int32_t minCapacity) { 1127+ // Grow exponentially to reduce the frequency of allocations. 1128+ if (minCapacity < INITIAL_CAPACITY) { 1129+ return minCapacity + INITIAL_CAPACITY; 1130+ } else if (minCapacity <= 2500) { 1131+ return 5 * minCapacity; 1132+ } else { 1133+ int32_t newCapacity = 2 * minCapacity; 1134+ if (newCapacity > MAX_LENGTH) { 1135+ newCapacity = MAX_LENGTH; 1136+ } 1137+ return newCapacity; 1138+ } 1139+} 1140+ 1141+bool UnicodeSet::ensureCapacity(int32_t newLen) { 1142+ if (newLen > MAX_LENGTH) { 1143+ newLen = MAX_LENGTH; 1144+ } 1145 if (newLen <= capacity) { 1146- return; 1147+ return true; 1148 } 1149- UChar32* temp = (UChar32*) uprv_realloc(list, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1150+ int32_t newCapacity = nextCapacity(newLen); 1151+ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); 1152 if (temp == NULL) { 1153- ec = U_MEMORY_ALLOCATION_ERROR; 1154 setToBogus(); // set the object to bogus state if an OOM failure occurred. 1155- return; 1156+ return false; 1157+ } 1158+ // Copy only the actual contents. 1159+ uprv_memcpy(temp, list, len * sizeof(UChar32)); 1160+ if (list != stackList) { 1161+ uprv_free(list); 1162 } 1163 list = temp; 1164- capacity = newLen + GROW_EXTRA; 1165- // else we keep the original contents on the memory failure. 1166+ capacity = newCapacity; 1167+ return true; 1168 } 1169 1170-void UnicodeSet::ensureBufferCapacity(int32_t newLen, UErrorCode& ec) { 1171- if (buffer != NULL && newLen <= bufferCapacity) 1172- return; 1173- UChar32* temp = (UChar32*) uprv_realloc(buffer, sizeof(UChar32) * (newLen + GROW_EXTRA)); 1174+bool UnicodeSet::ensureBufferCapacity(int32_t newLen) { 1175+ if (newLen > MAX_LENGTH) { 1176+ newLen = MAX_LENGTH; 1177+ } 1178+ if (newLen <= bufferCapacity) { 1179+ return true; 1180+ } 1181+ int32_t newCapacity = nextCapacity(newLen); 1182+ UChar32* temp = (UChar32*) uprv_malloc(newCapacity * sizeof(UChar32)); 1183 if (temp == NULL) { 1184- ec = U_MEMORY_ALLOCATION_ERROR; 1185 setToBogus(); 1186- return; 1187+ return false; 1188+ } 1189+ // The buffer has no contents to be copied. 1190+ // It is always filled from scratch after this call. 1191+ if (buffer != stackList) { 1192+ uprv_free(buffer); 1193 } 1194 buffer = temp; 1195- bufferCapacity = newLen + GROW_EXTRA; 1196- // else we keep the original contents on the memory failure. 1197+ bufferCapacity = newCapacity; 1198+ return true; 1199 } 1200 1201 /** 1202@@ -1727,9 +1729,7 @@ void UnicodeSet::exclusiveOr(const UChar32* other, int32_t otherLen, int8_t pola 1203 if (isFrozen() || isBogus()) { 1204 return; 1205 } 1206- UErrorCode status = U_ZERO_ERROR; 1207- ensureBufferCapacity(len + otherLen, status); 1208- if (U_FAILURE(status)) { 1209+ if (!ensureBufferCapacity(len + otherLen)) { 1210 return; 1211 } 1212 1213@@ -1777,9 +1777,7 @@ void UnicodeSet::add(const UChar32* other, int32_t otherLen, int8_t polarity) { 1214 if (isFrozen() || isBogus() || other==NULL) { 1215 return; 1216 } 1217- UErrorCode status = U_ZERO_ERROR; 1218- ensureBufferCapacity(len + otherLen, status); 1219- if (U_FAILURE(status)) { 1220+ if (!ensureBufferCapacity(len + otherLen)) { 1221 return; 1222 } 1223 1224@@ -1890,9 +1888,7 @@ void UnicodeSet::retain(const UChar32* other, int32_t otherLen, int8_t polarity) 1225 if (isFrozen() || isBogus()) { 1226 return; 1227 } 1228- UErrorCode status = U_ZERO_ERROR; 1229- ensureBufferCapacity(len + otherLen, status); 1230- if (U_FAILURE(status)) { 1231+ if (!ensureBufferCapacity(len + otherLen)) { 1232 return; 1233 } 1234 1235@@ -2138,12 +2134,14 @@ UnicodeString& UnicodeSet::_generatePattern(UnicodeString& result, 1236 } 1237 } 1238 1239- for (int32_t i = 0; i<strings->size(); ++i) { 1240- result.append(OPEN_BRACE); 1241- _appendToPat(result, 1242- *(const UnicodeString*) strings->elementAt(i), 1243- escapeUnprintable); 1244- result.append(CLOSE_BRACE); 1245+ if (strings != nullptr) { 1246+ for (int32_t i = 0; i<strings->size(); ++i) { 1247+ result.append(OPEN_BRACE); 1248+ _appendToPat(result, 1249+ *(const UnicodeString*) strings->elementAt(i), 1250+ escapeUnprintable); 1251+ result.append(CLOSE_BRACE); 1252+ } 1253 } 1254 return result.append(SET_CLOSE); 1255 } 1256@@ -2162,13 +2160,12 @@ void UnicodeSet::releasePattern() { 1257 /** 1258 * Set the new pattern to cache. 1259 */ 1260-void UnicodeSet::setPattern(const UnicodeString& newPat) { 1261+void UnicodeSet::setPattern(const char16_t *newPat, int32_t newPatLen) { 1262 releasePattern(); 1263- int32_t newPatLen = newPat.length(); 1264 pat = (UChar *)uprv_malloc((newPatLen + 1) * sizeof(UChar)); 1265 if (pat) { 1266 patLen = newPatLen; 1267- newPat.extractBetween(0, patLen, pat); 1268+ u_memcpy(pat, newPat, patLen); 1269 pat[patLen] = 0; 1270 } 1271 // else we don't care if malloc failed. This was just a nice cache. 1272@@ -2177,30 +2174,15 @@ void UnicodeSet::setPattern(const UnicodeString& newPat) { 1273 1274 UnicodeFunctor *UnicodeSet::freeze() { 1275 if(!isFrozen() && !isBogus()) { 1276- // Do most of what compact() does before freezing because 1277- // compact() will not work when the set is frozen. 1278- // Small modification: Don't shrink if the savings would be tiny (<=GROW_EXTRA). 1279- 1280- // Delete buffer first to defragment memory less. 1281- if (buffer != NULL) { 1282- uprv_free(buffer); 1283- buffer = NULL; 1284- } 1285- if (capacity > (len + GROW_EXTRA)) { 1286- // Make the capacity equal to len or 1. 1287- // We don't want to realloc of 0 size. 1288- capacity = len + (len == 0); 1289- list = (UChar32*) uprv_realloc(list, sizeof(UChar32) * capacity); 1290- if (list == NULL) { // Check for memory allocation error. 1291- setToBogus(); 1292- return this; 1293- } 1294- } 1295+ compact(); 1296 1297 // Optimize contains() and span() and similar functions. 1298- if (!strings->isEmpty()) { 1299+ if (hasStrings()) { 1300 stringSpan = new UnicodeSetStringSpan(*this, *strings, UnicodeSetStringSpan::ALL); 1301- if (stringSpan != NULL && !stringSpan->needsStringSpanUTF16()) { 1302+ if (stringSpan == nullptr) { 1303+ setToBogus(); 1304+ return this; 1305+ } else if (!stringSpan->needsStringSpanUTF16()) { 1306 // All strings are irrelevant for span() etc. because 1307 // all of each string's code points are contained in this set. 1308 // Do not check needsStringSpanUTF8() because UTF-8 has at most as 1309@@ -2233,7 +2215,7 @@ int32_t UnicodeSet::span(const UChar *s, int32_t length, USetSpanCondition spanC 1310 } 1311 if(stringSpan!=NULL) { 1312 return stringSpan->span(s, length, spanCondition); 1313- } else if(!strings->isEmpty()) { 1314+ } else if(hasStrings()) { 1315 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1316 UnicodeSetStringSpan::FWD_UTF16_NOT_CONTAINED : 1317 UnicodeSetStringSpan::FWD_UTF16_CONTAINED; 1318@@ -2270,7 +2252,7 @@ int32_t UnicodeSet::spanBack(const UChar *s, int32_t length, USetSpanCondition s 1319 } 1320 if(stringSpan!=NULL) { 1321 return stringSpan->spanBack(s, length, spanCondition); 1322- } else if(!strings->isEmpty()) { 1323+ } else if(hasStrings()) { 1324 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1325 UnicodeSetStringSpan::BACK_UTF16_NOT_CONTAINED : 1326 UnicodeSetStringSpan::BACK_UTF16_CONTAINED; 1327@@ -2308,7 +2290,7 @@ int32_t UnicodeSet::spanUTF8(const char *s, int32_t length, USetSpanCondition sp 1328 } 1329 if(stringSpan!=NULL) { 1330 return stringSpan->spanUTF8((const uint8_t *)s, length, spanCondition); 1331- } else if(!strings->isEmpty()) { 1332+ } else if(hasStrings()) { 1333 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1334 UnicodeSetStringSpan::FWD_UTF8_NOT_CONTAINED : 1335 UnicodeSetStringSpan::FWD_UTF8_CONTAINED; 1336@@ -2346,7 +2328,7 @@ int32_t UnicodeSet::spanBackUTF8(const char *s, int32_t length, USetSpanConditio 1337 } 1338 if(stringSpan!=NULL) { 1339 return stringSpan->spanBackUTF8((const uint8_t *)s, length, spanCondition); 1340- } else if(!strings->isEmpty()) { 1341+ } else if(hasStrings()) { 1342 uint32_t which= spanCondition==USET_SPAN_NOT_CONTAINED ? 1343 UnicodeSetStringSpan::BACK_UTF8_NOT_CONTAINED : 1344 UnicodeSetStringSpan::BACK_UTF8_CONTAINED; 1345diff --git a/source/common/uniset_closure.cpp b/source/common/uniset_closure.cpp 1346index 0b7da796..882231ba 100644 1347--- a/source/common/uniset_closure.cpp 1348+++ b/source/common/uniset_closure.cpp 1349@@ -31,10 +31,6 @@ 1350 #include "util.h" 1351 #include "uvector.h" 1352 1353-// initial storage. Must be >= 0 1354-// *** same as in uniset.cpp ! *** 1355-#define START_EXTRA 16 1356- 1357 U_NAMESPACE_BEGIN 1358 1359 // TODO memory debugging provided inside uniset.cpp 1360@@ -49,42 +45,16 @@ U_NAMESPACE_BEGIN 1361 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 1362 uint32_t options, 1363 const SymbolTable* symbols, 1364- UErrorCode& status) : 1365- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1366- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1367- fFlags(0) 1368-{ 1369- if(U_SUCCESS(status)){ 1370- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1371- /* test for NULL */ 1372- if(list == NULL) { 1373- status = U_MEMORY_ALLOCATION_ERROR; 1374- }else{ 1375- allocateStrings(status); 1376- applyPattern(pattern, options, symbols, status); 1377- } 1378- } 1379+ UErrorCode& status) { 1380+ applyPattern(pattern, options, symbols, status); 1381 _dbgct(this); 1382 } 1383 1384 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 1385 uint32_t options, 1386 const SymbolTable* symbols, 1387- UErrorCode& status) : 1388- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1389- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1390- fFlags(0) 1391-{ 1392- if(U_SUCCESS(status)){ 1393- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1394- /* test for NULL */ 1395- if(list == NULL) { 1396- status = U_MEMORY_ALLOCATION_ERROR; 1397- }else{ 1398- allocateStrings(status); 1399- applyPattern(pattern, pos, options, symbols, status); 1400- } 1401- } 1402+ UErrorCode& status) { 1403+ applyPattern(pattern, pos, options, symbols, status); 1404 _dbgct(this); 1405 } 1406 1407@@ -199,7 +169,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1408 // start with input set to guarantee inclusion 1409 // USET_CASE: remove strings because the strings will actually be reduced (folded); 1410 // therefore, start with no strings and add only those needed 1411- if (attribute & USET_CASE_INSENSITIVE) { 1412+ if ((attribute & USET_CASE_INSENSITIVE) && foldSet.hasStrings()) { 1413 foldSet.strings->removeAllElements(); 1414 } 1415 1416@@ -234,7 +204,7 @@ UnicodeSet& UnicodeSet::closeOver(int32_t attribute) { 1417 } 1418 } 1419 } 1420- if (strings != NULL && strings->size() > 0) { 1421+ if (hasStrings()) { 1422 if (attribute & USET_CASE_INSENSITIVE) { 1423 for (int32_t j=0; j<strings->size(); ++j) { 1424 str = *(const UnicodeString *) strings->elementAt(j); 1425diff --git a/source/common/uniset_props.cpp b/source/common/uniset_props.cpp 1426index 6cfd80a7..e98c175f 100644 1427--- a/source/common/uniset_props.cpp 1428+++ b/source/common/uniset_props.cpp 1429@@ -47,10 +47,6 @@ 1430 1431 U_NAMESPACE_USE 1432 1433-// initial storage. Must be >= 0 1434-// *** same as in uniset.cpp ! *** 1435-#define START_EXTRA 16 1436- 1437 // Define UChar constants using hex for EBCDIC compatibility 1438 // Used #define to reduce private static exports and memory access time. 1439 #define SET_OPEN ((UChar)0x005B) /*[*/ 1440@@ -185,21 +181,8 @@ isPOSIXClose(const UnicodeString &pattern, int32_t pos) { 1441 * @param pattern a string specifying what characters are in the set 1442 */ 1443 UnicodeSet::UnicodeSet(const UnicodeString& pattern, 1444- UErrorCode& status) : 1445- len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), 1446- bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), 1447- fFlags(0) 1448-{ 1449- if(U_SUCCESS(status)){ 1450- list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); 1451- /* test for NULL */ 1452- if(list == NULL) { 1453- status = U_MEMORY_ALLOCATION_ERROR; 1454- }else{ 1455- allocateStrings(status); 1456- applyPattern(pattern, status); 1457- } 1458- } 1459+ UErrorCode& status) { 1460+ applyPattern(pattern, status); 1461 _dbgct(this); 1462 } 1463 1464@@ -713,6 +696,11 @@ static UBool numericValueFilter(UChar32 ch, void* context) { 1465 return u_getNumericValue(ch) == *(double*)context; 1466 } 1467 1468+static UBool generalCategoryMaskFilter(UChar32 ch, void* context) { 1469+ int32_t value = *(int32_t*)context; 1470+ return (U_GET_GC_MASK((UChar32) ch) & value) != 0; 1471+} 1472+ 1473 static UBool versionFilter(UChar32 ch, void* context) { 1474 static const UVersionInfo none = { 0, 0, 0, 0 }; 1475 UVersionInfo v; 1476@@ -721,6 +709,16 @@ static UBool versionFilter(UChar32 ch, void* context) { 1477 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; 1478 } 1479 1480+typedef struct { 1481+ UProperty prop; 1482+ int32_t value; 1483+} IntPropertyContext; 1484+ 1485+static UBool intPropertyFilter(UChar32 ch, void* context) { 1486+ IntPropertyContext* c = (IntPropertyContext*)context; 1487+ return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value; 1488+} 1489+ 1490 static UBool scriptExtensionsFilter(UChar32 ch, void* context) { 1491 return uscript_hasScript(ch, *(UScriptCode*)context); 1492 } 1493@@ -781,43 +779,6 @@ void UnicodeSet::applyFilter(UnicodeSet::Filter filter, 1494 1495 namespace { 1496 1497-/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ 1498-uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { 1499- uint32_t mask = *(const uint32_t *)context; 1500- value = U_MASK(value) & mask; 1501- if (value != 0) { value = 1; } 1502- return value; 1503-} 1504- 1505-/** Maps one map value to 1, all others to 0. */ 1506-uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { 1507- uint32_t v = *(const uint32_t *)context; 1508- return value == v ? 1 : 0; 1509-} 1510- 1511-} // namespace 1512- 1513-void UnicodeSet::applyIntPropertyValue(const UCPMap *map, 1514- UCPMapValueFilter *filter, const void *context, 1515- UErrorCode &errorCode) { 1516- if (U_FAILURE(errorCode)) { return; } 1517- clear(); 1518- UChar32 start = 0, end; 1519- uint32_t value; 1520- while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, 1521- filter, context, &value)) >= 0) { 1522- if (value != 0) { 1523- add(start, end); 1524- } 1525- start = end + 1; 1526- } 1527- if (isBogus()) { 1528- errorCode = U_MEMORY_ALLOCATION_ERROR; 1529- } 1530-} 1531- 1532-namespace { 1533- 1534 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 1535 /* Note: we use ' ' in compiler code page */ 1536 int32_t j = 0; 1537@@ -845,11 +806,10 @@ static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { 1538 1539 UnicodeSet& 1540 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { 1541- if (U_FAILURE(ec)) { return *this; } 1542- // All of the following check isFrozen() before modifying this set. 1543+ if (U_FAILURE(ec) || isFrozen()) { return *this; } 1544 if (prop == UCHAR_GENERAL_CATEGORY_MASK) { 1545- const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); 1546- applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); 1547+ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 1548+ applyFilter(generalCategoryMaskFilter, &value, inclusions, ec); 1549 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { 1550 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 1551 UScriptCode script = (UScriptCode)value; 1552@@ -866,14 +826,11 @@ UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) 1553 clear(); 1554 } 1555 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { 1556- const UCPMap *map = u_getIntPropertyMap(prop, &ec); 1557- applyIntPropertyValue(map, intValueFilter, &value, ec); 1558+ const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); 1559+ IntPropertyContext c = {prop, value}; 1560+ applyFilter(intPropertyFilter, &c, inclusions, ec); 1561 } else { 1562- // This code used to always call getInclusions(property source) 1563- // which sets an error for an unsupported property. 1564 ec = U_ILLEGAL_ARGUMENT_ERROR; 1565- // Otherwise we would just clear() this set because 1566- // getIntPropertyValue(c, prop) returns 0 for all code points. 1567 } 1568 return *this; 1569 } 1570diff --git a/source/common/uprops.h b/source/common/uprops.h 1571index 1a8e4e84..34b3600b 100644 1572--- a/source/common/uprops.h 1573+++ b/source/common/uprops.h 1574@@ -462,7 +462,6 @@ class UnicodeSet; 1575 class CharacterProperties { 1576 public: 1577 CharacterProperties() = delete; 1578- static void U_CALLCONV initInclusion(UPropertySource src, UErrorCode &errorCode); 1579 static const UnicodeSet *getInclusionsForProperty(UProperty prop, UErrorCode &errorCode); 1580 }; 1581 1582diff --git a/source/common/uset.cpp b/source/common/uset.cpp 1583index 39ad0a34..eae7981d 100644 1584--- a/source/common/uset.cpp 1585+++ b/source/common/uset.cpp 1586@@ -249,7 +249,7 @@ class USetAccess /* not : public UObject because all methods are static */ { 1587 public: 1588 /* Try to have the compiler inline these*/ 1589 inline static int32_t getStringCount(const UnicodeSet& set) { 1590- return set.getStringCount(); 1591+ return set.stringsSize(); 1592 } 1593 inline static const UnicodeString* getString(const UnicodeSet& set, 1594 int32_t i) { 1595diff --git a/source/common/usetiter.cpp b/source/common/usetiter.cpp 1596index 93048ba2..79151690 100644 1597--- a/source/common/usetiter.cpp 1598+++ b/source/common/usetiter.cpp 1599@@ -116,7 +116,7 @@ void UnicodeSetIterator::reset() { 1600 stringCount = 0; 1601 } else { 1602 endRange = set->getRangeCount() - 1; 1603- stringCount = set->strings->size(); 1604+ stringCount = set->stringsSize(); 1605 } 1606 range = 0; 1607 endElement = -1; 1608