1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File unistr.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 09/25/98 stephen Creation.
15 * 04/20/99 stephen Overhauled per 4/16 code review.
16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18 * Replaceable.
19 * 06/25/01 grhoten Removed the dependency on iostream
20 ******************************************************************************
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/appendable.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "uelement.h"
33 #include "ustr_imp.h"
34 #include "umutex.h"
35 #include "uassert.h"
36
37 #if 0
38
39 #include <iostream>
40 using namespace std;
41
42 //DEBUGGING
43 void
44 print(const UnicodeString& s,
45 const char *name)
46 {
47 UChar c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57 }
58
59 void
60 print(const UChar *s,
61 int32_t len,
62 const char *name)
63 {
64 UChar c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74 }
75 // END DEBUGGING
76 #endif
77
78 // Local function definitions for now
79
80 // need to copy areas that may overlap
81 static
82 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)83 us_arrayCopy(const UChar *src, int32_t srcStart,
84 UChar *dst, int32_t dstStart, int32_t count)
85 {
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89 }
90
91 // u_unescapeAt() callback to get a UChar from a UnicodeString
92 U_CDECL_BEGIN
93 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)94 UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96 }
97 U_CDECL_END
98
99 U_NAMESPACE_BEGIN
100
101 /* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103 */
~Replaceable()104 Replaceable::~Replaceable() {}
105
106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108 UnicodeString U_EXPORT2
109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114 }
115
116 //========================================
117 // Reference Counting functions, put at top of file so that optimizing compilers
118 // have a chance to automatically inline.
119 //========================================
120
121 void
addRef()122 UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124 }
125
126 int32_t
removeRef()127 UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129 }
130
131 int32_t
refCount() const132 UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134 }
135
136 void
releaseArray()137 UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141 }
142
143
144
145 //========================================
146 // Constructors
147 //========================================
148
149 // The default constructor is inline in unistr.h.
150
UnicodeString(int32_t capacity,UChar32 c,int32_t count)151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 UChar *array = getArrayStart();
163 UChar unit = (UChar)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 UChar *array = getArrayStart();
181 UChar lead = U16_LEAD(c);
182 UChar trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190 }
191
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
UnicodeString(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const UChar *text = textPtr;
225 if(text == NULL) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<UChar *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241 }
242
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)243 UnicodeString::UnicodeString(UChar *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == NULL) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const UChar *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263 }
264
UnicodeString(const char * src,int32_t length,EInvariant)265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==NULL) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, FALSE)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280 }
281
282 #if U_CHARSET_IS_UTF8
283
UnicodeString(const char * codepageData)284 UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289 }
290
UnicodeString(const char * codepageData,int32_t dataLength)291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301 }
302
303 // else see unistr_cnv.cpp
304 #endif
305
// Copy constructor: starts as an empty stack string, then lets copyFrom()
// do the work.
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  this->copyFrom(that);
}
310
UnicodeString(UnicodeString && src)311 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312 copyFieldsFrom(src, TRUE);
313 }
314
// Constructs from that, beginning at srcStart: starts as an empty stack
// string and delegates to setTo().
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  this->setTo(that, srcStart);
}
320
// Constructs from srcLength units of that, beginning at srcStart: starts as
// an empty stack string and delegates to setTo().
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;
  this->setTo(that, srcStart, srcLength);
}
327
328 // Replaceable base class clone() default implementation, does not clone
329 Replaceable *
clone() const330 Replaceable::clone() const {
331 return NULL;
332 }
333
334 // UnicodeString overrides clone() with a real implementation
335 UnicodeString *
clone() const336 UnicodeString::clone() const {
337 return new UnicodeString(*this);
338 }
339
340 //========================================
341 // array allocation
342 //========================================
343
namespace {

// Fixed part of the growth increment, in UChars.
const int32_t kGrowSize = 128;

// Upper bound for a requested capacity.
// One int32_t reference counter plus capacity UChars must fit into a 32-bit
// size_t (at least on 32-bit platforms). One more UChar is added for the NUL
// terminator, to avoid reallocation in getTerminatedBuffer(), and the byte
// count is rounded up to a multiple of 16.
// Hence capacity may be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (More complicated checks could allow up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns newLength plus a quarter of newLength plus kGrowSize,
// capped at kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = kGrowSize + (newLength >> 2);
  const int32_t headroom = kMaxCapacity - newLength;
  return extra <= headroom ? newLength + extra : kMaxCapacity;
}

}  // namespace
367
368 UBool
allocate(int32_t capacity)369 UnicodeString::allocate(int32_t capacity) {
370 if(capacity <= US_STACKBUF_SIZE) {
371 fUnion.fFields.fLengthAndFlags = kShortString;
372 return TRUE;
373 }
374 if(capacity <= kMaxCapacity) {
375 ++capacity; // for the NUL
376 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
377 // Reference counter + UChars.
378 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
379 // Round up to a multiple of 16.
380 numBytes = (numBytes + 15) & ~15;
381 int32_t *array = (int32_t *) uprv_malloc(numBytes);
382 if(array != NULL) {
383 // set initial refCount and point behind the refCount
384 *array++ = 1;
385 numBytes -= sizeof(int32_t);
386
387 // have fArray point to the first UChar
388 fUnion.fFields.fArray = (UChar *)array;
389 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
390 fUnion.fFields.fLengthAndFlags = kLongString;
391 return TRUE;
392 }
393 }
394 fUnion.fFields.fLengthAndFlags = kIsBogus;
395 fUnion.fFields.fArray = 0;
396 fUnion.fFields.fCapacity = 0;
397 return FALSE;
398 }
399
400 //========================================
401 // Destructor
402 //========================================
403
#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
// Counters incremented by the destructor: one slot per short length,
// plus one shared counter for strings without a short length.
static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
static u_atomic_int32_t beyondCount(0);

// Prints one line per length 0..59, then a single ">59" line that sums the
// remaining slots together with beyondCount.
U_CAPI void unistr_printLengths() {
  int32_t slot = 0;
  while(slot <= 59) {
    printf("%2d, %9d\n", slot, (int32_t)finalLengthCounts[slot]);
    ++slot;
  }
  int32_t beyond = beyondCount;
  while(slot < UPRV_LENGTHOF(finalLengthCounts)) {
    beyond += finalLengthCounts[slot];
    ++slot;
  }
  printf(">59, %9d\n", beyond);
}
#endif
420
~UnicodeString()421 UnicodeString::~UnicodeString()
422 {
423 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
424 // Count lengths of strings at the end of their lifetime.
425 // Useful for discussion of a desirable stack buffer size.
426 // Count the contents length, not the optional NUL terminator nor further capacity.
427 // Ignore open-buffer strings and strings which alias external storage.
428 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
429 if(hasShortLength()) {
430 umtx_atomic_inc(finalLengthCounts + getShortLength());
431 } else {
432 umtx_atomic_inc(&beyondCount);
433 }
434 }
435 #endif
436
437 releaseArray();
438 }
439
440 //========================================
441 // Factory methods
442 //========================================
443
fromUTF8(StringPiece utf8)444 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
445 UnicodeString result;
446 result.setToUTF8(utf8);
447 return result;
448 }
449
fromUTF32(const UChar32 * utf32,int32_t length)450 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
451 UnicodeString result;
452 int32_t capacity;
453 // Most UTF-32 strings will be BMP-only and result in a same-length
454 // UTF-16 string. We overestimate the capacity just slightly,
455 // just in case there are a few supplementary characters.
456 if(length <= US_STACKBUF_SIZE) {
457 capacity = US_STACKBUF_SIZE;
458 } else {
459 capacity = length + (length >> 4) + 4;
460 }
461 do {
462 UChar *utf16 = result.getBuffer(capacity);
463 int32_t length16;
464 UErrorCode errorCode = U_ZERO_ERROR;
465 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
466 utf32, length,
467 0xfffd, // Substitution character.
468 NULL, // Don't care about number of substitutions.
469 &errorCode);
470 result.releaseBuffer(length16);
471 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
472 capacity = length16 + 1; // +1 for the terminating NUL.
473 continue;
474 } else if(U_FAILURE(errorCode)) {
475 result.setToBogus();
476 }
477 break;
478 } while(TRUE);
479 return result;
480 }
481
482 //========================================
483 // Assignment
484 //========================================
485
486 UnicodeString &
operator =(const UnicodeString & src)487 UnicodeString::operator=(const UnicodeString &src) {
488 return copyFrom(src);
489 }
490
491 UnicodeString &
fastCopyFrom(const UnicodeString & src)492 UnicodeString::fastCopyFrom(const UnicodeString &src) {
493 return copyFrom(src, TRUE);
494 }
495
496 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)497 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
498 // if assigning to ourselves, do nothing
499 if(this == &src) {
500 return *this;
501 }
502
503 // is the right side bogus?
504 if(src.isBogus()) {
505 setToBogus();
506 return *this;
507 }
508
509 // delete the current contents
510 releaseArray();
511
512 if(src.isEmpty()) {
513 // empty string - use the stack buffer
514 setToEmpty();
515 return *this;
516 }
517
518 // fLength>0 and not an "open" src.getBuffer(minCapacity)
519 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
520 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
521 case kShortString:
522 // short string using the stack buffer, do the same
523 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
524 getShortLength() * U_SIZEOF_UCHAR);
525 break;
526 case kLongString:
527 // src uses a refCounted string buffer, use that buffer with refCount
528 // src is const, use a cast - we don't actually change it
529 ((UnicodeString &)src).addRef();
530 // copy all fields, share the reference-counted buffer
531 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
532 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
533 if(!hasShortLength()) {
534 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
535 }
536 break;
537 case kReadonlyAlias:
538 if(fastCopy) {
539 // src is a readonly alias, do the same
540 // -> maintain the readonly alias as such
541 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
542 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
543 if(!hasShortLength()) {
544 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
545 }
546 break;
547 }
548 // else if(!fastCopy) fall through to case kWritableAlias
549 // -> allocate a new buffer and copy the contents
550 U_FALLTHROUGH;
551 case kWritableAlias: {
552 // src is a writable alias; we make a copy of that instead
553 int32_t srcLength = src.length();
554 if(allocate(srcLength)) {
555 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
556 setLength(srcLength);
557 break;
558 }
559 // if there is not enough memory, then fall through to setting to bogus
560 U_FALLTHROUGH;
561 }
562 default:
563 // if src is bogus, set ourselves to bogus
564 // do not call setToBogus() here because fArray and flags are not consistent here
565 fUnion.fFields.fLengthAndFlags = kIsBogus;
566 fUnion.fFields.fArray = 0;
567 fUnion.fFields.fCapacity = 0;
568 break;
569 }
570
571 return *this;
572 }
573
operator =(UnicodeString && src)574 UnicodeString &UnicodeString::operator=(UnicodeString &&src) U_NOEXCEPT {
575 // No explicit check for self move assignment, consistent with standard library.
576 // Self move assignment causes no crash nor leak but might make the object bogus.
577 releaseArray();
578 copyFieldsFrom(src, TRUE);
579 return *this;
580 }
581
582 // Same as move assignment except without memory management.
copyFieldsFrom(UnicodeString & src,UBool setSrcToBogus)583 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
584 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
585 if(lengthAndFlags & kUsingStackBuffer) {
586 // Short string using the stack buffer, copy the contents.
587 // Check for self assignment to prevent "overlap in memcpy" warnings,
588 // although it should be harmless to copy a buffer to itself exactly.
589 if(this != &src) {
590 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
591 getShortLength() * U_SIZEOF_UCHAR);
592 }
593 } else {
594 // In all other cases, copy all fields.
595 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
596 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
597 if(!hasShortLength()) {
598 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
599 }
600 if(setSrcToBogus) {
601 // Set src to bogus without releasing any memory.
602 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
603 src.fUnion.fFields.fArray = NULL;
604 src.fUnion.fFields.fCapacity = 0;
605 }
606 }
607 }
608
swap(UnicodeString & other)609 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
610 UnicodeString temp; // Empty short string: Known not to need releaseArray().
611 // Copy fields without resetting source values in between.
612 temp.copyFieldsFrom(*this, FALSE);
613 this->copyFieldsFrom(other, FALSE);
614 other.copyFieldsFrom(temp, FALSE);
615 // Set temp to an empty string so that other's memory is not released twice.
616 temp.fUnion.fFields.fLengthAndFlags = kShortString;
617 }
618
619 //========================================
620 // Miscellaneous operations
621 //========================================
622
unescape() const623 UnicodeString UnicodeString::unescape() const {
624 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
625 if (result.isBogus()) {
626 return result;
627 }
628 const UChar *array = getBuffer();
629 int32_t len = length();
630 int32_t prev = 0;
631 for (int32_t i=0;;) {
632 if (i == len) {
633 result.append(array, prev, len - prev);
634 break;
635 }
636 if (array[i++] == 0x5C /*'\\'*/) {
637 result.append(array, prev, (i - 1) - prev);
638 UChar32 c = unescapeAt(i); // advances i
639 if (c < 0) {
640 result.remove(); // return empty string
641 break; // invalid escape sequence
642 }
643 result.append(c);
644 prev = i;
645 }
646 }
647 return result;
648 }
649
unescapeAt(int32_t & offset) const650 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
651 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
652 }
653
654 //========================================
655 // Read-only implementation
656 //========================================
657 UBool
doEquals(const UnicodeString & text,int32_t len) const658 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
659 // Requires: this & text not bogus and have same lengths.
660 // Byte-wise comparison works for equality regardless of endianness.
661 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
662 }
663
664 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const665 UnicodeString::doCompare( int32_t start,
666 int32_t length,
667 const UChar *srcChars,
668 int32_t srcStart,
669 int32_t srcLength) const
670 {
671 // compare illegal string values
672 if(isBogus()) {
673 return -1;
674 }
675
676 // pin indices to legal values
677 pinIndices(start, length);
678
679 if(srcChars == NULL) {
680 // treat const UChar *srcChars==NULL as an empty string
681 return length == 0 ? 0 : 1;
682 }
683
684 // get the correct pointer
685 const UChar *chars = getArrayStart();
686
687 chars += start;
688 srcChars += srcStart;
689
690 int32_t minLength;
691 int8_t lengthResult;
692
693 // get the srcLength if necessary
694 if(srcLength < 0) {
695 srcLength = u_strlen(srcChars + srcStart);
696 }
697
698 // are we comparing different lengths?
699 if(length != srcLength) {
700 if(length < srcLength) {
701 minLength = length;
702 lengthResult = -1;
703 } else {
704 minLength = srcLength;
705 lengthResult = 1;
706 }
707 } else {
708 minLength = length;
709 lengthResult = 0;
710 }
711
712 /*
713 * note that uprv_memcmp() returns an int but we return an int8_t;
714 * we need to take care not to truncate the result -
715 * one way to do this is to right-shift the value to
716 * move the sign bit into the lower 8 bits and making sure that this
717 * does not become 0 itself
718 */
719
720 if(minLength > 0 && chars != srcChars) {
721 int32_t result;
722
723 # if U_IS_BIG_ENDIAN
724 // big-endian: byte comparison works
725 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
726 if(result != 0) {
727 return (int8_t)(result >> 15 | 1);
728 }
729 # else
730 // little-endian: compare UChar units
731 do {
732 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
733 if(result != 0) {
734 return (int8_t)(result >> 15 | 1);
735 }
736 } while(--minLength > 0);
737 # endif
738 }
739 return lengthResult;
740 }
741
742 /* String compare in code point order - doCompare() compares in code unit order. */
743 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const744 UnicodeString::doCompareCodePointOrder(int32_t start,
745 int32_t length,
746 const UChar *srcChars,
747 int32_t srcStart,
748 int32_t srcLength) const
749 {
750 // compare illegal string values
751 // treat const UChar *srcChars==NULL as an empty string
752 if(isBogus()) {
753 return -1;
754 }
755
756 // pin indices to legal values
757 pinIndices(start, length);
758
759 if(srcChars == NULL) {
760 srcStart = srcLength = 0;
761 }
762
763 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
764 /* translate the 32-bit result into an 8-bit one */
765 if(diff!=0) {
766 return (int8_t)(diff >> 15 | 1);
767 } else {
768 return 0;
769 }
770 }
771
772 int32_t
getLength() const773 UnicodeString::getLength() const {
774 return length();
775 }
776
777 UChar
getCharAt(int32_t offset) const778 UnicodeString::getCharAt(int32_t offset) const {
779 return charAt(offset);
780 }
781
782 UChar32
getChar32At(int32_t offset) const783 UnicodeString::getChar32At(int32_t offset) const {
784 return char32At(offset);
785 }
786
787 UChar32
char32At(int32_t offset) const788 UnicodeString::char32At(int32_t offset) const
789 {
790 int32_t len = length();
791 if((uint32_t)offset < (uint32_t)len) {
792 const UChar *array = getArrayStart();
793 UChar32 c;
794 U16_GET(array, 0, offset, len, c);
795 return c;
796 } else {
797 return kInvalidUChar;
798 }
799 }
800
801 int32_t
getChar32Start(int32_t offset) const802 UnicodeString::getChar32Start(int32_t offset) const {
803 if((uint32_t)offset < (uint32_t)length()) {
804 const UChar *array = getArrayStart();
805 U16_SET_CP_START(array, 0, offset);
806 return offset;
807 } else {
808 return 0;
809 }
810 }
811
812 int32_t
getChar32Limit(int32_t offset) const813 UnicodeString::getChar32Limit(int32_t offset) const {
814 int32_t len = length();
815 if((uint32_t)offset < (uint32_t)len) {
816 const UChar *array = getArrayStart();
817 U16_SET_CP_LIMIT(array, 0, offset, len);
818 return offset;
819 } else {
820 return len;
821 }
822 }
823
824 int32_t
countChar32(int32_t start,int32_t length) const825 UnicodeString::countChar32(int32_t start, int32_t length) const {
826 pinIndices(start, length);
827 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
828 return u_countChar32(getArrayStart()+start, length);
829 }
830
831 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const832 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
833 pinIndices(start, length);
834 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
835 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
836 }
837
838 int32_t
moveIndex32(int32_t index,int32_t delta) const839 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
840 // pin index
841 int32_t len = length();
842 if(index<0) {
843 index=0;
844 } else if(index>len) {
845 index=len;
846 }
847
848 const UChar *array = getArrayStart();
849 if(delta>0) {
850 U16_FWD_N(array, index, len, delta);
851 } else {
852 U16_BACK_N(array, 0, index, -delta);
853 }
854
855 return index;
856 }
857
858 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const859 UnicodeString::doExtract(int32_t start,
860 int32_t length,
861 UChar *dst,
862 int32_t dstStart) const
863 {
864 // pin indices to legal values
865 pinIndices(start, length);
866
867 // do not copy anything if we alias dst itself
868 const UChar *array = getArrayStart();
869 if(array + start != dst + dstStart) {
870 us_arrayCopy(array, start, dst, dstStart, length);
871 }
872 }
873
874 int32_t
extract(Char16Ptr dest,int32_t destCapacity,UErrorCode & errorCode) const875 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
876 UErrorCode &errorCode) const {
877 int32_t len = length();
878 if(U_SUCCESS(errorCode)) {
879 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
880 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
881 } else {
882 const UChar *array = getArrayStart();
883 if(len>0 && len<=destCapacity && array!=dest) {
884 u_memcpy(dest, array, len);
885 }
886 return u_terminateUChars(dest, destCapacity, len, &errorCode);
887 }
888 }
889
890 return len;
891 }
892
893 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const894 UnicodeString::extract(int32_t start,
895 int32_t length,
896 char *target,
897 int32_t targetCapacity,
898 enum EInvariant) const
899 {
900 // if the arguments are illegal, then do nothing
901 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
902 return 0;
903 }
904
905 // pin the indices to legal values
906 pinIndices(start, length);
907
908 if(length <= targetCapacity) {
909 u_UCharsToChars(getArrayStart() + start, target, length);
910 }
911 UErrorCode status = U_ZERO_ERROR;
912 return u_terminateChars(target, targetCapacity, length, &status);
913 }
914
915 UnicodeString
tempSubString(int32_t start,int32_t len) const916 UnicodeString::tempSubString(int32_t start, int32_t len) const {
917 pinIndices(start, len);
918 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
919 if(array==NULL) {
920 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
921 len=-2; // bogus result string
922 }
923 return UnicodeString(FALSE, array + start, len);
924 }
925
926 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const927 UnicodeString::toUTF8(int32_t start, int32_t len,
928 char *target, int32_t capacity) const {
929 pinIndices(start, len);
930 int32_t length8;
931 UErrorCode errorCode = U_ZERO_ERROR;
932 u_strToUTF8WithSub(target, capacity, &length8,
933 getBuffer() + start, len,
934 0xFFFD, // Standard substitution character.
935 NULL, // Don't care about number of substitutions.
936 &errorCode);
937 return length8;
938 }
939
940 #if U_CHARSET_IS_UTF8
941
942 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const943 UnicodeString::extract(int32_t start, int32_t len,
944 char *target, uint32_t dstSize) const {
945 // if the arguments are illegal, then do nothing
946 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
947 return 0;
948 }
949 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
950 }
951
952 // else see unistr_cnv.cpp
953 #endif
954
955 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const956 UnicodeString::extractBetween(int32_t start,
957 int32_t limit,
958 UnicodeString& target) const {
959 pinIndex(start);
960 pinIndex(limit);
961 doExtract(start, limit - start, target);
962 }
963
964 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
965 // as many bytes as the source has UChars.
966 // The "worst cases" are writing systems like Indic, Thai and CJK with
967 // 3:1 bytes:UChars.
968 void
toUTF8(ByteSink & sink) const969 UnicodeString::toUTF8(ByteSink &sink) const {
970 int32_t length16 = length();
971 if(length16 != 0) {
972 char stackBuffer[1024];
973 int32_t capacity = (int32_t)sizeof(stackBuffer);
974 UBool utf8IsOwned = FALSE;
975 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
976 3*length16,
977 stackBuffer, capacity,
978 &capacity);
979 int32_t length8 = 0;
980 UErrorCode errorCode = U_ZERO_ERROR;
981 u_strToUTF8WithSub(utf8, capacity, &length8,
982 getBuffer(), length16,
983 0xFFFD, // Standard substitution character.
984 NULL, // Don't care about number of substitutions.
985 &errorCode);
986 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
987 utf8 = (char *)uprv_malloc(length8);
988 if(utf8 != NULL) {
989 utf8IsOwned = TRUE;
990 errorCode = U_ZERO_ERROR;
991 u_strToUTF8WithSub(utf8, length8, &length8,
992 getBuffer(), length16,
993 0xFFFD, // Standard substitution character.
994 NULL, // Don't care about number of substitutions.
995 &errorCode);
996 } else {
997 errorCode = U_MEMORY_ALLOCATION_ERROR;
998 }
999 }
1000 if(U_SUCCESS(errorCode)) {
1001 sink.Append(utf8, length8);
1002 sink.Flush();
1003 }
1004 if(utf8IsOwned) {
1005 uprv_free(utf8);
1006 }
1007 }
1008 }
1009
1010 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const1011 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1012 int32_t length32=0;
1013 if(U_SUCCESS(errorCode)) {
1014 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1015 u_strToUTF32WithSub(utf32, capacity, &length32,
1016 getBuffer(), length(),
1017 0xfffd, // Substitution character.
1018 NULL, // Don't care about number of substitutions.
1019 &errorCode);
1020 }
1021 return length32;
1022 }
1023
1024 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1025 UnicodeString::indexOf(const UChar *srcChars,
1026 int32_t srcStart,
1027 int32_t srcLength,
1028 int32_t start,
1029 int32_t length) const
1030 {
1031 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1032 return -1;
1033 }
1034
1035 // UnicodeString does not find empty substrings
1036 if(srcLength < 0 && srcChars[srcStart] == 0) {
1037 return -1;
1038 }
1039
1040 // get the indices within bounds
1041 pinIndices(start, length);
1042
1043 // find the first occurrence of the substring
1044 const UChar *array = getArrayStart();
1045 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1046 if(match == NULL) {
1047 return -1;
1048 } else {
1049 return (int32_t)(match - array);
1050 }
1051 }
1052
1053 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const1054 UnicodeString::doIndexOf(UChar c,
1055 int32_t start,
1056 int32_t length) const
1057 {
1058 // pin indices
1059 pinIndices(start, length);
1060
1061 // find the first occurrence of c
1062 const UChar *array = getArrayStart();
1063 const UChar *match = u_memchr(array + start, c, length);
1064 if(match == NULL) {
1065 return -1;
1066 } else {
1067 return (int32_t)(match - array);
1068 }
1069 }
1070
1071 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const1072 UnicodeString::doIndexOf(UChar32 c,
1073 int32_t start,
1074 int32_t length) const {
1075 // pin indices
1076 pinIndices(start, length);
1077
1078 // find the first occurrence of c
1079 const UChar *array = getArrayStart();
1080 const UChar *match = u_memchr32(array + start, c, length);
1081 if(match == NULL) {
1082 return -1;
1083 } else {
1084 return (int32_t)(match - array);
1085 }
1086 }
1087
1088 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1089 UnicodeString::lastIndexOf(const UChar *srcChars,
1090 int32_t srcStart,
1091 int32_t srcLength,
1092 int32_t start,
1093 int32_t length) const
1094 {
1095 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1096 return -1;
1097 }
1098
1099 // UnicodeString does not find empty substrings
1100 if(srcLength < 0 && srcChars[srcStart] == 0) {
1101 return -1;
1102 }
1103
1104 // get the indices within bounds
1105 pinIndices(start, length);
1106
1107 // find the last occurrence of the substring
1108 const UChar *array = getArrayStart();
1109 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1110 if(match == NULL) {
1111 return -1;
1112 } else {
1113 return (int32_t)(match - array);
1114 }
1115 }
1116
1117 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1118 UnicodeString::doLastIndexOf(UChar c,
1119 int32_t start,
1120 int32_t length) const
1121 {
1122 if(isBogus()) {
1123 return -1;
1124 }
1125
1126 // pin indices
1127 pinIndices(start, length);
1128
1129 // find the last occurrence of c
1130 const UChar *array = getArrayStart();
1131 const UChar *match = u_memrchr(array + start, c, length);
1132 if(match == NULL) {
1133 return -1;
1134 } else {
1135 return (int32_t)(match - array);
1136 }
1137 }
1138
1139 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1140 UnicodeString::doLastIndexOf(UChar32 c,
1141 int32_t start,
1142 int32_t length) const {
1143 // pin indices
1144 pinIndices(start, length);
1145
1146 // find the last occurrence of c
1147 const UChar *array = getArrayStart();
1148 const UChar *match = u_memrchr32(array + start, c, length);
1149 if(match == NULL) {
1150 return -1;
1151 } else {
1152 return (int32_t)(match - array);
1153 }
1154 }
1155
1156 //========================================
1157 // Write implementation
1158 //========================================
1159
1160 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1161 UnicodeString::findAndReplace(int32_t start,
1162 int32_t length,
1163 const UnicodeString& oldText,
1164 int32_t oldStart,
1165 int32_t oldLength,
1166 const UnicodeString& newText,
1167 int32_t newStart,
1168 int32_t newLength)
1169 {
1170 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1171 return *this;
1172 }
1173
1174 pinIndices(start, length);
1175 oldText.pinIndices(oldStart, oldLength);
1176 newText.pinIndices(newStart, newLength);
1177
1178 if(oldLength == 0) {
1179 return *this;
1180 }
1181
1182 while(length > 0 && length >= oldLength) {
1183 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1184 if(pos < 0) {
1185 // no more oldText's here: done
1186 break;
1187 } else {
1188 // we found oldText, replace it by newText and go beyond it
1189 replace(pos, oldLength, newText, newStart, newLength);
1190 length -= pos + oldLength - start;
1191 start = pos + newLength;
1192 }
1193 }
1194
1195 return *this;
1196 }
1197
1198
1199 void
setToBogus()1200 UnicodeString::setToBogus()
1201 {
1202 releaseArray();
1203
1204 fUnion.fFields.fLengthAndFlags = kIsBogus;
1205 fUnion.fFields.fArray = 0;
1206 fUnion.fFields.fCapacity = 0;
1207 }
1208
1209 // turn a bogus string into an empty one
1210 void
unBogus()1211 UnicodeString::unBogus() {
1212 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1213 setToEmpty();
1214 }
1215 }
1216
// Returns a pointer to this string's contents with a NUL code unit at
// index length(), or nullptr if the string is not writable or a buffer
// with room for the terminator cannot be obtained.
//
// The current buffer is returned directly when it has spare capacity and
// either
//  - it is a read-only alias that already holds a NUL at array[len], or
//  - it is not read-only and not shared (not refcounted, or refCount()==1),
//    in which case the NUL is written in place.
// In every other case cloneArrayIfNeeded(len+1) is asked for a buffer that
// this string may write to, and the NUL is stored there.
const char16_t *
UnicodeString::getTerminatedBuffer() {
  if(!isWritable()) {
    return nullptr;
  }
  UChar *array = getArrayStart();
  int32_t len = length();
  if(len < getCapacity()) {
    if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
      // If len<capacity on a read-only alias, then array[len] is
      // either the original NUL (if constructed with (TRUE, s, length))
      // or one of the original string contents characters (if later truncated),
      // therefore we can assume that array[len] is initialized memory.
      if(array[len] == 0) {
        return array;
      }
      // Otherwise fall through to the cloning path below: a read-only
      // buffer must not be written to.
    } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
      // kRefCounted: Do not write the NUL if the buffer is shared.
      // That is mostly safe, except when the length of one copy was modified
      // without copy-on-write, e.g., via truncate(newLength) or remove(void).
      // Then the NUL would be written into the middle of another copy's string.

      // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
      // Do not test if there is a NUL already because it might be uninitialized memory.
      // (That would be safe, but tools like valgrind & Purify would complain.)
      array[len] = 0;
      return array;
    }
  }
  // len<INT32_MAX guards the len+1 computation against overflow.
  if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
    array = getArrayStart();
    array[len] = 0;
    return array;
  } else {
    return nullptr;
  }
}
1254
1255 // setTo() analogous to the readonly-aliasing constructor with the same signature
1256 UnicodeString &
setTo(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)1257 UnicodeString::setTo(UBool isTerminated,
1258 ConstChar16Ptr textPtr,
1259 int32_t textLength)
1260 {
1261 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1262 // do not modify a string that has an "open" getBuffer(minCapacity)
1263 return *this;
1264 }
1265
1266 const UChar *text = textPtr;
1267 if(text == NULL) {
1268 // treat as an empty string, do not alias
1269 releaseArray();
1270 setToEmpty();
1271 return *this;
1272 }
1273
1274 if( textLength < -1 ||
1275 (textLength == -1 && !isTerminated) ||
1276 (textLength >= 0 && isTerminated && text[textLength] != 0)
1277 ) {
1278 setToBogus();
1279 return *this;
1280 }
1281
1282 releaseArray();
1283
1284 if(textLength == -1) {
1285 // text is terminated, or else it would have failed the above test
1286 textLength = u_strlen(text);
1287 }
1288 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1289 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1290 return *this;
1291 }
1292
1293 // setTo() analogous to the writable-aliasing constructor with the same signature
1294 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1295 UnicodeString::setTo(UChar *buffer,
1296 int32_t buffLength,
1297 int32_t buffCapacity) {
1298 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1299 // do not modify a string that has an "open" getBuffer(minCapacity)
1300 return *this;
1301 }
1302
1303 if(buffer == NULL) {
1304 // treat as an empty string, do not alias
1305 releaseArray();
1306 setToEmpty();
1307 return *this;
1308 }
1309
1310 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1311 setToBogus();
1312 return *this;
1313 } else if(buffLength == -1) {
1314 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1315 const UChar *p = buffer, *limit = buffer + buffCapacity;
1316 while(p != limit && *p != 0) {
1317 ++p;
1318 }
1319 buffLength = (int32_t)(p - buffer);
1320 }
1321
1322 releaseArray();
1323
1324 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1325 setArray(buffer, buffLength, buffCapacity);
1326 return *this;
1327 }
1328
setToUTF8(StringPiece utf8)1329 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1330 unBogus();
1331 int32_t length = utf8.length();
1332 int32_t capacity;
1333 // The UTF-16 string will be at most as long as the UTF-8 string.
1334 if(length <= US_STACKBUF_SIZE) {
1335 capacity = US_STACKBUF_SIZE;
1336 } else {
1337 capacity = length + 1; // +1 for the terminating NUL.
1338 }
1339 UChar *utf16 = getBuffer(capacity);
1340 int32_t length16;
1341 UErrorCode errorCode = U_ZERO_ERROR;
1342 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1343 utf8.data(), length,
1344 0xfffd, // Substitution character.
1345 NULL, // Don't care about number of substitutions.
1346 &errorCode);
1347 releaseBuffer(length16);
1348 if(U_FAILURE(errorCode)) {
1349 setToBogus();
1350 }
1351 return *this;
1352 }
1353
1354 UnicodeString&
setCharAt(int32_t offset,UChar c)1355 UnicodeString::setCharAt(int32_t offset,
1356 UChar c)
1357 {
1358 int32_t len = length();
1359 if(cloneArrayIfNeeded() && len > 0) {
1360 if(offset < 0) {
1361 offset = 0;
1362 } else if(offset >= len) {
1363 offset = len - 1;
1364 }
1365
1366 getArrayStart()[offset] = c;
1367 }
1368 return *this;
1369 }
1370
1371 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1372 UnicodeString::replace(int32_t start,
1373 int32_t _length,
1374 UChar32 srcChar) {
1375 UChar buffer[U16_MAX_LENGTH];
1376 int32_t count = 0;
1377 UBool isError = FALSE;
1378 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1379 // We test isError so that the compiler does not complain that we don't.
1380 // If isError (srcChar is not a valid code point) then count==0 which means
1381 // we remove the source segment rather than replacing it with srcChar.
1382 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1383 }
1384
1385 UnicodeString&
append(UChar32 srcChar)1386 UnicodeString::append(UChar32 srcChar) {
1387 UChar buffer[U16_MAX_LENGTH];
1388 int32_t _length = 0;
1389 UBool isError = FALSE;
1390 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1391 // We test isError so that the compiler does not complain that we don't.
1392 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1393 return isError ? *this : doAppend(buffer, 0, _length);
1394 }
1395
1396 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1397 UnicodeString::doReplace( int32_t start,
1398 int32_t length,
1399 const UnicodeString& src,
1400 int32_t srcStart,
1401 int32_t srcLength)
1402 {
1403 // pin the indices to legal values
1404 src.pinIndices(srcStart, srcLength);
1405
1406 // get the characters from src
1407 // and replace the range in ourselves with them
1408 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1409 }
1410
// Replaces the range (start, length) of this string with srcLength code
// units from srcChars + srcStart. Returns *this.
//
// Outline:
//  1. A string that is not writable is returned unchanged.
//  2. Removing a prefix or suffix from a read-only alias is done by
//     adjusting the array pointer/length/capacity, without copying.
//  3. A replace at the very end is forwarded to doAppend().
//  4. A null srcChars is treated as an empty source; a negative srcLength
//     means that the source is NUL-terminated.
//  5. If the new length would overflow int32_t, the string becomes bogus.
//  6. If the source overlaps this string's own writable buffer, the source
//     is copied into a temporary string first.
//  7. Otherwise a suitable buffer is obtained via cloneArrayIfNeeded(), the
//     unchanged parts are moved into place and the source is copied in.
UnicodeString&
UnicodeString::doReplace(int32_t start,
                         int32_t length,
                         const UChar *srcChars,
                         int32_t srcStart,
                         int32_t srcLength)
{
  if(!isWritable()) {
    return *this;
  }

  int32_t oldLength = this->length();

  // optimize (read-only alias).remove(0, start) and .remove(start, end)
  if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
    if(start == 0) {
      // remove prefix by adjusting the array pointer
      pinIndex(length);
      fUnion.fFields.fArray += length;
      fUnion.fFields.fCapacity -= length;
      setLength(oldLength - length);
      return *this;
    } else {
      pinIndex(start);
      if(length >= (oldLength - start)) {
        // remove suffix by reducing the length (like truncate())
        setLength(start);
        fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
        return *this;
      }
    }
  }

  // Replacing "nothing" at the very end is an append.
  if(start == oldLength) {
    return doAppend(srcChars, srcStart, srcLength);
  }

  if(srcChars == 0) {
    srcLength = 0;
  } else {
    // Perform all remaining operations relative to srcChars + srcStart.
    // From this point forward, do not use srcStart.
    srcChars += srcStart;
    if (srcLength < 0) {
      // get the srcLength if necessary
      srcLength = u_strlen(srcChars);
    }
  }

  // pin the indices to legal values
  pinIndices(start, length);

  // Calculate the size of the string after the replace.
  // Avoid int32_t overflow.
  int32_t newLength = oldLength - length;
  if(srcLength > (INT32_MAX - newLength)) {
    setToBogus();
    return *this;
  }
  newLength += srcLength;

  // Check for insertion into ourself
  const UChar *oldArray = getArrayStart();
  if (isBufferWritable() &&
      oldArray < srcChars + srcLength &&
      srcChars < oldArray + oldLength) {
    // Copy into a new UnicodeString and start over
    UnicodeString copy(srcChars, srcLength);
    if (copy.isBogus()) {
      setToBogus();
      return *this;
    }
    return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
  }

  // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
  // therefore we need to keep the current fArray
  UChar oldStackBuffer[US_STACKBUF_SIZE];
  if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
    // copy the stack buffer contents because it will be overwritten with
    // fUnion.fFields values
    u_memcpy(oldStackBuffer, oldArray, oldLength);
    oldArray = oldStackBuffer;
  }

  // clone our array and allocate a bigger array if needed
  int32_t *bufferToDelete = 0;
  if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
                         FALSE, &bufferToDelete)
  ) {
    return *this;
  }

  // now do the replace

  UChar *newArray = getArrayStart();
  if(newArray != oldArray) {
    // if fArray changed, then we need to copy everything except what will change
    us_arrayCopy(oldArray, 0, newArray, 0, start);
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  } else if(length != srcLength) {
    // fArray did not change; copy only the portion that isn't changing, leaving a hole
    us_arrayCopy(oldArray, start + length,
                 newArray, start + srcLength,
                 oldLength - (start + length));
  }

  // now fill in the hole with the new string
  us_arrayCopy(srcChars, 0, newArray, start, srcLength);

  setLength(newLength);

  // delayed delete in case srcChars == fArray when we started, and
  // to keep oldArray alive for the above operations
  if (bufferToDelete) {
    uprv_free(bufferToDelete);
  }

  return *this;
}
1533
1534 // Versions of doReplace() only for append() variants.
1535 // doReplace() and doAppend() optimize for different cases.
1536
1537 UnicodeString&
doAppend(const UnicodeString & src,int32_t srcStart,int32_t srcLength)1538 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1539 if(srcLength == 0) {
1540 return *this;
1541 }
1542
1543 // pin the indices to legal values
1544 src.pinIndices(srcStart, srcLength);
1545 return doAppend(src.getArrayStart(), srcStart, srcLength);
1546 }
1547
1548 UnicodeString&
doAppend(const UChar * srcChars,int32_t srcStart,int32_t srcLength)1549 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1550 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1551 return *this;
1552 }
1553
1554 // Perform all remaining operations relative to srcChars + srcStart.
1555 // From this point forward, do not use srcStart.
1556 srcChars += srcStart;
1557
1558 if(srcLength < 0) {
1559 // get the srcLength if necessary
1560 if((srcLength = u_strlen(srcChars)) == 0) {
1561 return *this;
1562 }
1563 }
1564
1565 int32_t oldLength = length();
1566 int32_t newLength;
1567 if (uprv_add32_overflow(oldLength, srcLength, &newLength)) {
1568 setToBogus();
1569 return *this;
1570 }
1571
1572 // Check for append onto ourself
1573 const UChar* oldArray = getArrayStart();
1574 if (isBufferWritable() &&
1575 oldArray < srcChars + srcLength &&
1576 srcChars < oldArray + oldLength) {
1577 // Copy into a new UnicodeString and start over
1578 UnicodeString copy(srcChars, srcLength);
1579 if (copy.isBogus()) {
1580 setToBogus();
1581 return *this;
1582 }
1583 return doAppend(copy.getArrayStart(), 0, srcLength);
1584 }
1585
1586 // optimize append() onto a large-enough, owned string
1587 if((newLength <= getCapacity() && isBufferWritable()) ||
1588 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1589 UChar *newArray = getArrayStart();
1590 // Do not copy characters when
1591 // UChar *buffer=str.getAppendBuffer(...);
1592 // is followed by
1593 // str.append(buffer, length);
1594 // or
1595 // str.appendString(buffer, length)
1596 // or similar.
1597 if(srcChars != newArray + oldLength) {
1598 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1599 }
1600 setLength(newLength);
1601 }
1602 return *this;
1603 }
1604
1605 /**
1606 * Replaceable API
1607 */
1608 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1609 UnicodeString::handleReplaceBetween(int32_t start,
1610 int32_t limit,
1611 const UnicodeString& text) {
1612 replaceBetween(start, limit, text);
1613 }
1614
1615 /**
1616 * Replaceable API
1617 */
1618 void
copy(int32_t start,int32_t limit,int32_t dest)1619 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1620 if (limit <= start) {
1621 return; // Nothing to do; avoid bogus malloc call
1622 }
1623 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1624 // Check to make sure text is not null.
1625 if (text != NULL) {
1626 extractBetween(start, limit, text, 0);
1627 insert(dest, text, 0, limit - start);
1628 uprv_free(text);
1629 }
1630 }
1631
/**
 * Replaceable API
 *
 * Base-class implementation: always returns TRUE.
 *
 * NOTE: This is for the Replaceable class.  There is no rep.cpp,
 * so we implement this function here.
 */
UBool Replaceable::hasMetaData() const {
    return TRUE;
}
1641
/**
 * Replaceable API
 *
 * UnicodeString implementation: always returns FALSE.
 */
UBool UnicodeString::hasMetaData() const {
    return FALSE;
}
1648
// Reverses the range (start, length) of this string in place and returns
// *this. Ranges of fewer than two code units are left alone, as is a string
// for which cloneArrayIfNeeded() does not succeed.
//
// The reversal is done in two passes: first all code units are swapped end
// for end; if a lead surrogate was seen anywhere in the range, a second pass
// swaps each (trail, lead) pair back so that surrogate pairs stay in
// lead-trail order.
UnicodeString&
UnicodeString::doReverse(int32_t start, int32_t length) {
  if(length <= 1 || !cloneArrayIfNeeded()) {
    return *this;
  }

  // pin the indices to legal values
  pinIndices(start, length);
  if(length <= 1) {  // pinIndices() might have shrunk the length
    return *this;
  }

  UChar *left = getArrayStart() + start;
  UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
  UChar swap;
  UBool hasSupplementary = FALSE;

  // Before the loop we know left<right because length>=2.
  do {
    // Swap *left and *right while recording whether either one is a lead surrogate.
    hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
    hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
    *right-- = swap;
  } while(left < right);
  // Make sure to test the middle code unit of an odd-length string.
  // Redundant if the length is even.
  hasSupplementary |= (UBool)U16_IS_LEAD(*left);

  /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
  if(hasSupplementary) {
    UChar swap2;

    left = getArrayStart() + start;
    right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
    while(left < right) {
      if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
        // a reversed pair (trail, lead): put it back into lead, trail order
        *left++ = swap2;
        *left++ = swap;
      } else {
        ++left;
      }
    }
  }

  return *this;
}
1694
1695 UBool
padLeading(int32_t targetLength,UChar padChar)1696 UnicodeString::padLeading(int32_t targetLength,
1697 UChar padChar)
1698 {
1699 int32_t oldLength = length();
1700 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1701 return FALSE;
1702 } else {
1703 // move contents up by padding width
1704 UChar *array = getArrayStart();
1705 int32_t start = targetLength - oldLength;
1706 us_arrayCopy(array, 0, array, start, oldLength);
1707
1708 // fill in padding character
1709 while(--start >= 0) {
1710 array[start] = padChar;
1711 }
1712 setLength(targetLength);
1713 return TRUE;
1714 }
1715 }
1716
1717 UBool
padTrailing(int32_t targetLength,UChar padChar)1718 UnicodeString::padTrailing(int32_t targetLength,
1719 UChar padChar)
1720 {
1721 int32_t oldLength = length();
1722 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1723 return FALSE;
1724 } else {
1725 // fill in padding character
1726 UChar *array = getArrayStart();
1727 int32_t length = targetLength;
1728 while(--length >= oldLength) {
1729 array[length] = padChar;
1730 }
1731 setLength(targetLength);
1732 return TRUE;
1733 }
1734 }
1735
1736 //========================================
1737 // Hashing
1738 //========================================
1739 int32_t
doHashCode() const1740 UnicodeString::doHashCode() const
1741 {
1742 /* Delegate hash computation to uhash. This makes UnicodeString
1743 * hashing consistent with UChar* hashing. */
1744 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1745 if (hashCode == kInvalidHashCode) {
1746 hashCode = kEmptyHashCode;
1747 }
1748 return hashCode;
1749 }
1750
1751 //========================================
1752 // External Buffer
1753 //========================================
1754
1755 char16_t *
getBuffer(int32_t minCapacity)1756 UnicodeString::getBuffer(int32_t minCapacity) {
1757 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1758 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1759 setZeroLength();
1760 return getArrayStart();
1761 } else {
1762 return nullptr;
1763 }
1764 }
1765
1766 void
releaseBuffer(int32_t newLength)1767 UnicodeString::releaseBuffer(int32_t newLength) {
1768 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1769 // set the new fLength
1770 int32_t capacity=getCapacity();
1771 if(newLength==-1) {
1772 // the new length is the string length, capped by fCapacity
1773 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1774 while(p<limit && *p!=0) {
1775 ++p;
1776 }
1777 newLength=(int32_t)(p-array);
1778 } else if(newLength>capacity) {
1779 newLength=capacity;
1780 }
1781 setLength(newLength);
1782 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1783 }
1784 }
1785
1786 //========================================
1787 // Miscellaneous
1788 //========================================
// Makes sure that this string has a buffer of its own that it may write to,
// with room for at least newCapacity code units.
//
// newCapacity      required capacity; -1 means "the current capacity"
// growCapacity     capacity to try first when a new buffer is allocated;
//                  a negative value means "same as newCapacity"
// doCopyArray      if true, the old contents (as much as fits) are copied
//                  into a new buffer; if false, a new buffer starts out
//                  with length zero
// pBufferToDelete  if not null and the old refcounted buffer is no longer
//                  referenced, the pointer to free is stored here for the
//                  caller instead of being freed right away
// forceClone       if true, a new buffer is allocated unconditionally
//
// Returns TRUE if the string can be written to afterwards.
// Returns FALSE if the string is not writable, or if no memory could be
// allocated; in the latter case the string is set to bogus.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      // the required capacity fits the stack buffer: do not grow beyond it
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the larger growCapacity first, then fall back to newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
            // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
            // is defined as volatile. (Volatile has useful non-standard behavior
            //   with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1897
1898 // UnicodeStringAppendable ------------------------------------------------- ***
1899
// Out-of-line destructor with an empty body.
UnicodeStringAppendable::~UnicodeStringAppendable() {}
1901
1902 UBool
appendCodeUnit(UChar c)1903 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1904 return str.doAppend(&c, 0, 1).isWritable();
1905 }
1906
1907 UBool
appendCodePoint(UChar32 c)1908 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1909 UChar buffer[U16_MAX_LENGTH];
1910 int32_t cLength = 0;
1911 UBool isError = FALSE;
1912 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1913 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1914 }
1915
1916 UBool
appendString(const UChar * s,int32_t length)1917 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1918 return str.doAppend(s, 0, length).isWritable();
1919 }
1920
1921 UBool
reserveAppendCapacity(int32_t appendCapacity)1922 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1923 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1924 }
1925
1926 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1927 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1928 int32_t desiredCapacityHint,
1929 UChar *scratch, int32_t scratchCapacity,
1930 int32_t *resultCapacity) {
1931 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1932 *resultCapacity = 0;
1933 return NULL;
1934 }
1935 int32_t oldLength = str.length();
1936 if(minCapacity <= (kMaxCapacity - oldLength) &&
1937 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1938 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1939 *resultCapacity = str.getCapacity() - oldLength;
1940 return str.getArrayStart() + oldLength;
1941 }
1942 *resultCapacity = scratchCapacity;
1943 return scratch;
1944 }
1945
1946 U_NAMESPACE_END
1947
1948 U_NAMESPACE_USE
1949
1950 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1951 uhash_hashUnicodeString(const UElement key) {
1952 const UnicodeString *str = (const UnicodeString*) key.pointer;
1953 return (str == NULL) ? 0 : str->hashCode();
1954 }
1955
1956 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1957 // does not depend on hashtable code.
1958 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1959 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1960 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1961 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1962 if (str1 == str2) {
1963 return TRUE;
1964 }
1965 if (str1 == NULL || str2 == NULL) {
1966 return FALSE;
1967 }
1968 return *str1 == *str2;
1969 }
1970
1971 #ifdef U_STATIC_IMPLEMENTATION
/*
This should never be called. It is defined here to make sure that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This makes sure that static library dependencies are kept to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    // Allocate and immediately delete an array of strings; the array form of
    // delete is what the comment above refers to.
    delete [] (new UnicodeString[2]);
}
1982 #endif
1983