1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ******************************************************************************
8 *
9 * File unistr.cpp
10 *
11 * Modification History:
12 *
13 * Date Name Description
14 * 09/25/98 stephen Creation.
15 * 04/20/99 stephen Overhauled per 4/16 code review.
16 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
17 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
18 * Replaceable.
19 * 06/25/01 grhoten Removed the dependency on iostream
20 ******************************************************************************
21 */
22
23 #include "unicode/utypes.h"
24 #include "unicode/appendable.h"
25 #include "unicode/putil.h"
26 #include "cstring.h"
27 #include "cmemory.h"
28 #include "unicode/ustring.h"
29 #include "unicode/unistr.h"
30 #include "unicode/utf.h"
31 #include "unicode/utf16.h"
32 #include "uelement.h"
33 #include "ustr_imp.h"
34 #include "umutex.h"
35 #include "uassert.h"
36
37 #if 0
38
39 #include <iostream>
40 using namespace std;
41
42 //DEBUGGING
43 void
44 print(const UnicodeString& s,
45 const char *name)
46 {
47 UChar c;
48 cout << name << ":|";
49 for(int i = 0; i < s.length(); ++i) {
50 c = s[i];
51 if(c>= 0x007E || c < 0x0020)
52 cout << "[0x" << hex << s[i] << "]";
53 else
54 cout << (char) s[i];
55 }
56 cout << '|' << endl;
57 }
58
59 void
60 print(const UChar *s,
61 int32_t len,
62 const char *name)
63 {
64 UChar c;
65 cout << name << ":|";
66 for(int i = 0; i < len; ++i) {
67 c = s[i];
68 if(c>= 0x007E || c < 0x0020)
69 cout << "[0x" << hex << s[i] << "]";
70 else
71 cout << (char) s[i];
72 }
73 cout << '|' << endl;
74 }
75 // END DEBUGGING
76 #endif
77
78 // Local function definitions for now
79
80 // need to copy areas that may overlap
81 static
82 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)83 us_arrayCopy(const UChar *src, int32_t srcStart,
84 UChar *dst, int32_t dstStart, int32_t count)
85 {
86 if(count>0) {
87 uprv_memmove(dst+dstStart, src+srcStart, (size_t)count*sizeof(*src));
88 }
89 }
90
91 // u_unescapeAt() callback to get a UChar from a UnicodeString
92 U_CDECL_BEGIN
93 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)94 UnicodeString_charAt(int32_t offset, void *context) {
95 return ((icu::UnicodeString*) context)->charAt(offset);
96 }
97 U_CDECL_END
98
99 U_NAMESPACE_BEGIN
100
101 /* The Replaceable virtual destructor can't be defined in the header
102 due to how AIX works with multiple definitions of virtual functions.
103 */
~Replaceable()104 Replaceable::~Replaceable() {}
105
106 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
107
108 UnicodeString U_EXPORT2
109 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
110 return
111 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
112 append(s1).
113 append(s2);
114 }
115
116 //========================================
117 // Reference Counting functions, put at top of file so that optimizing compilers
118 // have a chance to automatically inline.
119 //========================================
120
121 void
addRef()122 UnicodeString::addRef() {
123 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
124 }
125
126 int32_t
removeRef()127 UnicodeString::removeRef() {
128 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
129 }
130
131 int32_t
refCount() const132 UnicodeString::refCount() const {
133 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
134 }
135
136 void
releaseArray()137 UnicodeString::releaseArray() {
138 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
139 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
140 }
141 }
142
143
144
145 //========================================
146 // Constructors
147 //========================================
148
149 // The default constructor is inline in unistr.h.
150
UnicodeString(int32_t capacity,UChar32 c,int32_t count)151 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
152 fUnion.fFields.fLengthAndFlags = 0;
153 if(count <= 0 || (uint32_t)c > 0x10ffff) {
154 // just allocate and do not do anything else
155 allocate(capacity);
156 } else if(c <= 0xffff) {
157 int32_t length = count;
158 if(capacity < length) {
159 capacity = length;
160 }
161 if(allocate(capacity)) {
162 UChar *array = getArrayStart();
163 UChar unit = (UChar)c;
164 for(int32_t i = 0; i < length; ++i) {
165 array[i] = unit;
166 }
167 setLength(length);
168 }
169 } else { // supplementary code point, write surrogate pairs
170 if(count > (INT32_MAX / 2)) {
171 // We would get more than 2G UChars.
172 allocate(capacity);
173 return;
174 }
175 int32_t length = count * 2;
176 if(capacity < length) {
177 capacity = length;
178 }
179 if(allocate(capacity)) {
180 UChar *array = getArrayStart();
181 UChar lead = U16_LEAD(c);
182 UChar trail = U16_TRAIL(c);
183 for(int32_t i = 0; i < length; i += 2) {
184 array[i] = lead;
185 array[i + 1] = trail;
186 }
187 setLength(length);
188 }
189 }
190 }
191
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
UnicodeString(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221 ConstChar16Ptr textPtr,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 const UChar *text = textPtr;
225 if(text == NULL) {
226 // treat as an empty string, do not alias
227 setToEmpty();
228 } else if(textLength < -1 ||
229 (textLength == -1 && !isTerminated) ||
230 (textLength >= 0 && isTerminated && text[textLength] != 0)
231 ) {
232 setToBogus();
233 } else {
234 if(textLength == -1) {
235 // text is terminated, or else it would have failed the above test
236 textLength = u_strlen(text);
237 }
238 setArray(const_cast<UChar *>(text), textLength,
239 isTerminated ? textLength + 1 : textLength);
240 }
241 }
242
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)243 UnicodeString::UnicodeString(UChar *buff,
244 int32_t buffLength,
245 int32_t buffCapacity) {
246 fUnion.fFields.fLengthAndFlags = kWritableAlias;
247 if(buff == NULL) {
248 // treat as an empty string, do not alias
249 setToEmpty();
250 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
251 setToBogus();
252 } else {
253 if(buffLength == -1) {
254 // fLength = u_strlen(buff); but do not look beyond buffCapacity
255 const UChar *p = buff, *limit = buff + buffCapacity;
256 while(p != limit && *p != 0) {
257 ++p;
258 }
259 buffLength = (int32_t)(p - buff);
260 }
261 setArray(buff, buffLength, buffCapacity);
262 }
263 }
264
UnicodeString(const char * src,int32_t length,EInvariant)265 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
266 fUnion.fFields.fLengthAndFlags = kShortString;
267 if(src==NULL) {
268 // treat as an empty string
269 } else {
270 if(length<0) {
271 length=(int32_t)uprv_strlen(src);
272 }
273 if(cloneArrayIfNeeded(length, length, FALSE)) {
274 u_charsToUChars(src, getArrayStart(), length);
275 setLength(length);
276 } else {
277 setToBogus();
278 }
279 }
280 }
281
282 #if U_CHARSET_IS_UTF8
283
UnicodeString(const char * codepageData)284 UnicodeString::UnicodeString(const char *codepageData) {
285 fUnion.fFields.fLengthAndFlags = kShortString;
286 if(codepageData != 0) {
287 setToUTF8(codepageData);
288 }
289 }
290
UnicodeString(const char * codepageData,int32_t dataLength)291 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
292 fUnion.fFields.fLengthAndFlags = kShortString;
293 // if there's nothing to convert, do nothing
294 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
295 return;
296 }
297 if(dataLength == -1) {
298 dataLength = (int32_t)uprv_strlen(codepageData);
299 }
300 setToUTF8(StringPiece(codepageData, dataLength));
301 }
302
303 // else see unistr_cnv.cpp
304 #endif
305
// Copy constructor.
UnicodeString::UnicodeString(const UnicodeString& that) {
  // Start as an empty stack-buffer string so that copyFrom() sees a valid object.
  fUnion.fFields.fLengthAndFlags = kShortString;
  copyFrom(that);
}
310
UnicodeString(UnicodeString && src)311 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
312 fUnion.fFields.fLengthAndFlags = kShortString;
313 moveFrom(src);
314 }
315
// Constructs a string via setTo(that, srcStart), i.e. from the part of that
// which starts at srcStart.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  // Start as an empty stack-buffer string before delegating to setTo().
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart);
}
321
// Constructs a string via setTo(that, srcStart, srcLength), i.e. from the
// srcLength code units of that which start at srcStart.
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  // Start as an empty stack-buffer string before delegating to setTo().
  fUnion.fFields.fLengthAndFlags = kShortString;
  setTo(that, srcStart, srcLength);
}
328
329 // Replaceable base class clone() default implementation, does not clone
330 Replaceable *
clone() const331 Replaceable::clone() const {
332 return NULL;
333 }
334
335 // UnicodeString overrides clone() with a real implementation
336 Replaceable *
clone() const337 UnicodeString::clone() const {
338 return new UnicodeString(*this);
339 }
340
341 //========================================
342 // array allocation
343 //========================================
344
namespace {

// Fixed part of the extra room added when a buffer has to grow.
const int32_t kGrowSize = 128;

// The number of bytes for one int32_t reference counter and capacity UChars
// must fit into a 32-bit size_t (at least when on a 32-bit platform).
// We also add one for the NUL terminator, to avoid reallocation in getTerminatedBuffer(),
// and round up to a multiple of 16 bytes.
// This means that capacity must be at most (0xfffffff0 - 4) / 2 - 1 = 0x7ffffff5.
// (With more complicated checks we could go up to 0x7ffffffd without rounding up,
// but that does not seem worth it.)
const int32_t kMaxCapacity = 0x7ffffff5;

// Returns the capacity to request for a string of newLength UChars:
// newLength plus a quarter of it plus kGrowSize, capped at kMaxCapacity.
int32_t getGrowCapacity(int32_t newLength) {
  const int32_t extra = (newLength >> 2) + kGrowSize;
  const int32_t headroom = kMaxCapacity - newLength;
  return (extra <= headroom) ? (newLength + extra) : kMaxCapacity;
}

}  // namespace
368
369 UBool
allocate(int32_t capacity)370 UnicodeString::allocate(int32_t capacity) {
371 if(capacity <= US_STACKBUF_SIZE) {
372 fUnion.fFields.fLengthAndFlags = kShortString;
373 return TRUE;
374 }
375 if(capacity <= kMaxCapacity) {
376 ++capacity; // for the NUL
377 // Switch to size_t which is unsigned so that we can allocate up to 4GB.
378 // Reference counter + UChars.
379 size_t numBytes = sizeof(int32_t) + (size_t)capacity * U_SIZEOF_UCHAR;
380 // Round up to a multiple of 16.
381 numBytes = (numBytes + 15) & ~15;
382 int32_t *array = (int32_t *) uprv_malloc(numBytes);
383 if(array != NULL) {
384 // set initial refCount and point behind the refCount
385 *array++ = 1;
386 numBytes -= sizeof(int32_t);
387
388 // have fArray point to the first UChar
389 fUnion.fFields.fArray = (UChar *)array;
390 fUnion.fFields.fCapacity = (int32_t)(numBytes / U_SIZEOF_UCHAR);
391 fUnion.fFields.fLengthAndFlags = kLongString;
392 return TRUE;
393 }
394 }
395 fUnion.fFields.fLengthAndFlags = kIsBogus;
396 fUnion.fFields.fArray = 0;
397 fUnion.fFields.fCapacity = 0;
398 return FALSE;
399 }
400
401 //========================================
402 // Destructor
403 //========================================
404
#ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
// Per-length counters, incremented by the destructor.
static u_atomic_int32_t finalLengthCounts[0x400];  // UnicodeString::kMaxShortLength+1
// Counter for strings without a short length.
static u_atomic_int32_t beyondCount(0);

// Prints one line per length 0..59 with its counter,
// then a single ">59" line with the sum of all remaining counters.
U_CAPI void unistr_printLengths() {
  int32_t i = 0;
  while(i <= 59) {
    printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
    ++i;
  }
  int32_t beyond = beyondCount;
  while(i < UPRV_LENGTHOF(finalLengthCounts)) {
    beyond += finalLengthCounts[i];
    ++i;
  }
  printf(">59, %9d\n", beyond);
}
#endif
421
~UnicodeString()422 UnicodeString::~UnicodeString()
423 {
424 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
425 // Count lengths of strings at the end of their lifetime.
426 // Useful for discussion of a desirable stack buffer size.
427 // Count the contents length, not the optional NUL terminator nor further capacity.
428 // Ignore open-buffer strings and strings which alias external storage.
429 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
430 if(hasShortLength()) {
431 umtx_atomic_inc(finalLengthCounts + getShortLength());
432 } else {
433 umtx_atomic_inc(&beyondCount);
434 }
435 }
436 #endif
437
438 releaseArray();
439 }
440
441 //========================================
442 // Factory methods
443 //========================================
444
fromUTF8(StringPiece utf8)445 UnicodeString UnicodeString::fromUTF8(StringPiece utf8) {
446 UnicodeString result;
447 result.setToUTF8(utf8);
448 return result;
449 }
450
fromUTF32(const UChar32 * utf32,int32_t length)451 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
452 UnicodeString result;
453 int32_t capacity;
454 // Most UTF-32 strings will be BMP-only and result in a same-length
455 // UTF-16 string. We overestimate the capacity just slightly,
456 // just in case there are a few supplementary characters.
457 if(length <= US_STACKBUF_SIZE) {
458 capacity = US_STACKBUF_SIZE;
459 } else {
460 capacity = length + (length >> 4) + 4;
461 }
462 do {
463 UChar *utf16 = result.getBuffer(capacity);
464 int32_t length16;
465 UErrorCode errorCode = U_ZERO_ERROR;
466 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
467 utf32, length,
468 0xfffd, // Substitution character.
469 NULL, // Don't care about number of substitutions.
470 &errorCode);
471 result.releaseBuffer(length16);
472 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
473 capacity = length16 + 1; // +1 for the terminating NUL.
474 continue;
475 } else if(U_FAILURE(errorCode)) {
476 result.setToBogus();
477 }
478 break;
479 } while(TRUE);
480 return result;
481 }
482
483 //========================================
484 // Assignment
485 //========================================
486
487 UnicodeString &
operator =(const UnicodeString & src)488 UnicodeString::operator=(const UnicodeString &src) {
489 return copyFrom(src);
490 }
491
492 UnicodeString &
fastCopyFrom(const UnicodeString & src)493 UnicodeString::fastCopyFrom(const UnicodeString &src) {
494 return copyFrom(src, TRUE);
495 }
496
497 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)498 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
499 // if assigning to ourselves, do nothing
500 if(this == &src) {
501 return *this;
502 }
503
504 // is the right side bogus?
505 if(src.isBogus()) {
506 setToBogus();
507 return *this;
508 }
509
510 // delete the current contents
511 releaseArray();
512
513 if(src.isEmpty()) {
514 // empty string - use the stack buffer
515 setToEmpty();
516 return *this;
517 }
518
519 // fLength>0 and not an "open" src.getBuffer(minCapacity)
520 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
521 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
522 case kShortString:
523 // short string using the stack buffer, do the same
524 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
525 getShortLength() * U_SIZEOF_UCHAR);
526 break;
527 case kLongString:
528 // src uses a refCounted string buffer, use that buffer with refCount
529 // src is const, use a cast - we don't actually change it
530 ((UnicodeString &)src).addRef();
531 // copy all fields, share the reference-counted buffer
532 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
533 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
534 if(!hasShortLength()) {
535 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
536 }
537 break;
538 case kReadonlyAlias:
539 if(fastCopy) {
540 // src is a readonly alias, do the same
541 // -> maintain the readonly alias as such
542 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
543 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
544 if(!hasShortLength()) {
545 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
546 }
547 break;
548 }
549 // else if(!fastCopy) fall through to case kWritableAlias
550 // -> allocate a new buffer and copy the contents
551 U_FALLTHROUGH;
552 case kWritableAlias: {
553 // src is a writable alias; we make a copy of that instead
554 int32_t srcLength = src.length();
555 if(allocate(srcLength)) {
556 u_memcpy(getArrayStart(), src.getArrayStart(), srcLength);
557 setLength(srcLength);
558 break;
559 }
560 // if there is not enough memory, then fall through to setting to bogus
561 U_FALLTHROUGH;
562 }
563 default:
564 // if src is bogus, set ourselves to bogus
565 // do not call setToBogus() here because fArray and flags are not consistent here
566 fUnion.fFields.fLengthAndFlags = kIsBogus;
567 fUnion.fFields.fArray = 0;
568 fUnion.fFields.fCapacity = 0;
569 break;
570 }
571
572 return *this;
573 }
574
moveFrom(UnicodeString & src)575 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
576 // No explicit check for self move assignment, consistent with standard library.
577 // Self move assignment causes no crash nor leak but might make the object bogus.
578 releaseArray();
579 copyFieldsFrom(src, TRUE);
580 return *this;
581 }
582
583 // Same as moveFrom() except without memory management.
copyFieldsFrom(UnicodeString & src,UBool setSrcToBogus)584 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
585 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
586 if(lengthAndFlags & kUsingStackBuffer) {
587 // Short string using the stack buffer, copy the contents.
588 // Check for self assignment to prevent "overlap in memcpy" warnings,
589 // although it should be harmless to copy a buffer to itself exactly.
590 if(this != &src) {
591 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
592 getShortLength() * U_SIZEOF_UCHAR);
593 }
594 } else {
595 // In all other cases, copy all fields.
596 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
597 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
598 if(!hasShortLength()) {
599 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
600 }
601 if(setSrcToBogus) {
602 // Set src to bogus without releasing any memory.
603 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
604 src.fUnion.fFields.fArray = NULL;
605 src.fUnion.fFields.fCapacity = 0;
606 }
607 }
608 }
609
swap(UnicodeString & other)610 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
611 UnicodeString temp; // Empty short string: Known not to need releaseArray().
612 // Copy fields without resetting source values in between.
613 temp.copyFieldsFrom(*this, FALSE);
614 this->copyFieldsFrom(other, FALSE);
615 other.copyFieldsFrom(temp, FALSE);
616 // Set temp to an empty string so that other's memory is not released twice.
617 temp.fUnion.fFields.fLengthAndFlags = kShortString;
618 }
619
620 //========================================
621 // Miscellaneous operations
622 //========================================
623
unescape() const624 UnicodeString UnicodeString::unescape() const {
625 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
626 if (result.isBogus()) {
627 return result;
628 }
629 const UChar *array = getBuffer();
630 int32_t len = length();
631 int32_t prev = 0;
632 for (int32_t i=0;;) {
633 if (i == len) {
634 result.append(array, prev, len - prev);
635 break;
636 }
637 if (array[i++] == 0x5C /*'\\'*/) {
638 result.append(array, prev, (i - 1) - prev);
639 UChar32 c = unescapeAt(i); // advances i
640 if (c < 0) {
641 result.remove(); // return empty string
642 break; // invalid escape sequence
643 }
644 result.append(c);
645 prev = i;
646 }
647 }
648 return result;
649 }
650
unescapeAt(int32_t & offset) const651 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
652 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
653 }
654
655 //========================================
656 // Read-only implementation
657 //========================================
658 UBool
doEquals(const UnicodeString & text,int32_t len) const659 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
660 // Requires: this & text not bogus and have same lengths.
661 // Byte-wise comparison works for equality regardless of endianness.
662 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
663 }
664
665 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const666 UnicodeString::doCompare( int32_t start,
667 int32_t length,
668 const UChar *srcChars,
669 int32_t srcStart,
670 int32_t srcLength) const
671 {
672 // compare illegal string values
673 if(isBogus()) {
674 return -1;
675 }
676
677 // pin indices to legal values
678 pinIndices(start, length);
679
680 if(srcChars == NULL) {
681 // treat const UChar *srcChars==NULL as an empty string
682 return length == 0 ? 0 : 1;
683 }
684
685 // get the correct pointer
686 const UChar *chars = getArrayStart();
687
688 chars += start;
689 srcChars += srcStart;
690
691 int32_t minLength;
692 int8_t lengthResult;
693
694 // get the srcLength if necessary
695 if(srcLength < 0) {
696 srcLength = u_strlen(srcChars + srcStart);
697 }
698
699 // are we comparing different lengths?
700 if(length != srcLength) {
701 if(length < srcLength) {
702 minLength = length;
703 lengthResult = -1;
704 } else {
705 minLength = srcLength;
706 lengthResult = 1;
707 }
708 } else {
709 minLength = length;
710 lengthResult = 0;
711 }
712
713 /*
714 * note that uprv_memcmp() returns an int but we return an int8_t;
715 * we need to take care not to truncate the result -
716 * one way to do this is to right-shift the value to
717 * move the sign bit into the lower 8 bits and making sure that this
718 * does not become 0 itself
719 */
720
721 if(minLength > 0 && chars != srcChars) {
722 int32_t result;
723
724 # if U_IS_BIG_ENDIAN
725 // big-endian: byte comparison works
726 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
727 if(result != 0) {
728 return (int8_t)(result >> 15 | 1);
729 }
730 # else
731 // little-endian: compare UChar units
732 do {
733 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
734 if(result != 0) {
735 return (int8_t)(result >> 15 | 1);
736 }
737 } while(--minLength > 0);
738 # endif
739 }
740 return lengthResult;
741 }
742
743 /* String compare in code point order - doCompare() compares in code unit order. */
744 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const745 UnicodeString::doCompareCodePointOrder(int32_t start,
746 int32_t length,
747 const UChar *srcChars,
748 int32_t srcStart,
749 int32_t srcLength) const
750 {
751 // compare illegal string values
752 // treat const UChar *srcChars==NULL as an empty string
753 if(isBogus()) {
754 return -1;
755 }
756
757 // pin indices to legal values
758 pinIndices(start, length);
759
760 if(srcChars == NULL) {
761 srcStart = srcLength = 0;
762 }
763
764 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
765 /* translate the 32-bit result into an 8-bit one */
766 if(diff!=0) {
767 return (int8_t)(diff >> 15 | 1);
768 } else {
769 return 0;
770 }
771 }
772
773 int32_t
getLength() const774 UnicodeString::getLength() const {
775 return length();
776 }
777
778 UChar
getCharAt(int32_t offset) const779 UnicodeString::getCharAt(int32_t offset) const {
780 return charAt(offset);
781 }
782
783 UChar32
getChar32At(int32_t offset) const784 UnicodeString::getChar32At(int32_t offset) const {
785 return char32At(offset);
786 }
787
788 UChar32
char32At(int32_t offset) const789 UnicodeString::char32At(int32_t offset) const
790 {
791 int32_t len = length();
792 if((uint32_t)offset < (uint32_t)len) {
793 const UChar *array = getArrayStart();
794 UChar32 c;
795 U16_GET(array, 0, offset, len, c);
796 return c;
797 } else {
798 return kInvalidUChar;
799 }
800 }
801
802 int32_t
getChar32Start(int32_t offset) const803 UnicodeString::getChar32Start(int32_t offset) const {
804 if((uint32_t)offset < (uint32_t)length()) {
805 const UChar *array = getArrayStart();
806 U16_SET_CP_START(array, 0, offset);
807 return offset;
808 } else {
809 return 0;
810 }
811 }
812
813 int32_t
getChar32Limit(int32_t offset) const814 UnicodeString::getChar32Limit(int32_t offset) const {
815 int32_t len = length();
816 if((uint32_t)offset < (uint32_t)len) {
817 const UChar *array = getArrayStart();
818 U16_SET_CP_LIMIT(array, 0, offset, len);
819 return offset;
820 } else {
821 return len;
822 }
823 }
824
825 int32_t
countChar32(int32_t start,int32_t length) const826 UnicodeString::countChar32(int32_t start, int32_t length) const {
827 pinIndices(start, length);
828 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
829 return u_countChar32(getArrayStart()+start, length);
830 }
831
832 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const833 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
834 pinIndices(start, length);
835 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
836 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
837 }
838
839 int32_t
moveIndex32(int32_t index,int32_t delta) const840 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
841 // pin index
842 int32_t len = length();
843 if(index<0) {
844 index=0;
845 } else if(index>len) {
846 index=len;
847 }
848
849 const UChar *array = getArrayStart();
850 if(delta>0) {
851 U16_FWD_N(array, index, len, delta);
852 } else {
853 U16_BACK_N(array, 0, index, -delta);
854 }
855
856 return index;
857 }
858
859 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const860 UnicodeString::doExtract(int32_t start,
861 int32_t length,
862 UChar *dst,
863 int32_t dstStart) const
864 {
865 // pin indices to legal values
866 pinIndices(start, length);
867
868 // do not copy anything if we alias dst itself
869 const UChar *array = getArrayStart();
870 if(array + start != dst + dstStart) {
871 us_arrayCopy(array, start, dst, dstStart, length);
872 }
873 }
874
875 int32_t
extract(Char16Ptr dest,int32_t destCapacity,UErrorCode & errorCode) const876 UnicodeString::extract(Char16Ptr dest, int32_t destCapacity,
877 UErrorCode &errorCode) const {
878 int32_t len = length();
879 if(U_SUCCESS(errorCode)) {
880 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
881 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
882 } else {
883 const UChar *array = getArrayStart();
884 if(len>0 && len<=destCapacity && array!=dest) {
885 u_memcpy(dest, array, len);
886 }
887 return u_terminateUChars(dest, destCapacity, len, &errorCode);
888 }
889 }
890
891 return len;
892 }
893
894 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const895 UnicodeString::extract(int32_t start,
896 int32_t length,
897 char *target,
898 int32_t targetCapacity,
899 enum EInvariant) const
900 {
901 // if the arguments are illegal, then do nothing
902 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
903 return 0;
904 }
905
906 // pin the indices to legal values
907 pinIndices(start, length);
908
909 if(length <= targetCapacity) {
910 u_UCharsToChars(getArrayStart() + start, target, length);
911 }
912 UErrorCode status = U_ZERO_ERROR;
913 return u_terminateChars(target, targetCapacity, length, &status);
914 }
915
916 UnicodeString
tempSubString(int32_t start,int32_t len) const917 UnicodeString::tempSubString(int32_t start, int32_t len) const {
918 pinIndices(start, len);
919 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
920 if(array==NULL) {
921 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
922 len=-2; // bogus result string
923 }
924 return UnicodeString(FALSE, array + start, len);
925 }
926
927 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const928 UnicodeString::toUTF8(int32_t start, int32_t len,
929 char *target, int32_t capacity) const {
930 pinIndices(start, len);
931 int32_t length8;
932 UErrorCode errorCode = U_ZERO_ERROR;
933 u_strToUTF8WithSub(target, capacity, &length8,
934 getBuffer() + start, len,
935 0xFFFD, // Standard substitution character.
936 NULL, // Don't care about number of substitutions.
937 &errorCode);
938 return length8;
939 }
940
941 #if U_CHARSET_IS_UTF8
942
943 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const944 UnicodeString::extract(int32_t start, int32_t len,
945 char *target, uint32_t dstSize) const {
946 // if the arguments are illegal, then do nothing
947 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
948 return 0;
949 }
950 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
951 }
952
953 // else see unistr_cnv.cpp
954 #endif
955
956 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const957 UnicodeString::extractBetween(int32_t start,
958 int32_t limit,
959 UnicodeString& target) const {
960 pinIndex(start);
961 pinIndex(limit);
962 doExtract(start, limit - start, target);
963 }
964
965 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
966 // as many bytes as the source has UChars.
967 // The "worst cases" are writing systems like Indic, Thai and CJK with
968 // 3:1 bytes:UChars.
969 void
toUTF8(ByteSink & sink) const970 UnicodeString::toUTF8(ByteSink &sink) const {
971 int32_t length16 = length();
972 if(length16 != 0) {
973 char stackBuffer[1024];
974 int32_t capacity = (int32_t)sizeof(stackBuffer);
975 UBool utf8IsOwned = FALSE;
976 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
977 3*length16,
978 stackBuffer, capacity,
979 &capacity);
980 int32_t length8 = 0;
981 UErrorCode errorCode = U_ZERO_ERROR;
982 u_strToUTF8WithSub(utf8, capacity, &length8,
983 getBuffer(), length16,
984 0xFFFD, // Standard substitution character.
985 NULL, // Don't care about number of substitutions.
986 &errorCode);
987 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
988 utf8 = (char *)uprv_malloc(length8);
989 if(utf8 != NULL) {
990 utf8IsOwned = TRUE;
991 errorCode = U_ZERO_ERROR;
992 u_strToUTF8WithSub(utf8, length8, &length8,
993 getBuffer(), length16,
994 0xFFFD, // Standard substitution character.
995 NULL, // Don't care about number of substitutions.
996 &errorCode);
997 } else {
998 errorCode = U_MEMORY_ALLOCATION_ERROR;
999 }
1000 }
1001 if(U_SUCCESS(errorCode)) {
1002 sink.Append(utf8, length8);
1003 sink.Flush();
1004 }
1005 if(utf8IsOwned) {
1006 uprv_free(utf8);
1007 }
1008 }
1009 }
1010
1011 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const1012 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
1013 int32_t length32=0;
1014 if(U_SUCCESS(errorCode)) {
1015 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
1016 u_strToUTF32WithSub(utf32, capacity, &length32,
1017 getBuffer(), length(),
1018 0xfffd, // Substitution character.
1019 NULL, // Don't care about number of substitutions.
1020 &errorCode);
1021 }
1022 return length32;
1023 }
1024
1025 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1026 UnicodeString::indexOf(const UChar *srcChars,
1027 int32_t srcStart,
1028 int32_t srcLength,
1029 int32_t start,
1030 int32_t length) const
1031 {
1032 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1033 return -1;
1034 }
1035
1036 // UnicodeString does not find empty substrings
1037 if(srcLength < 0 && srcChars[srcStart] == 0) {
1038 return -1;
1039 }
1040
1041 // get the indices within bounds
1042 pinIndices(start, length);
1043
1044 // find the first occurrence of the substring
1045 const UChar *array = getArrayStart();
1046 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1047 if(match == NULL) {
1048 return -1;
1049 } else {
1050 return (int32_t)(match - array);
1051 }
1052 }
1053
1054 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const1055 UnicodeString::doIndexOf(UChar c,
1056 int32_t start,
1057 int32_t length) const
1058 {
1059 // pin indices
1060 pinIndices(start, length);
1061
1062 // find the first occurrence of c
1063 const UChar *array = getArrayStart();
1064 const UChar *match = u_memchr(array + start, c, length);
1065 if(match == NULL) {
1066 return -1;
1067 } else {
1068 return (int32_t)(match - array);
1069 }
1070 }
1071
1072 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const1073 UnicodeString::doIndexOf(UChar32 c,
1074 int32_t start,
1075 int32_t length) const {
1076 // pin indices
1077 pinIndices(start, length);
1078
1079 // find the first occurrence of c
1080 const UChar *array = getArrayStart();
1081 const UChar *match = u_memchr32(array + start, c, length);
1082 if(match == NULL) {
1083 return -1;
1084 } else {
1085 return (int32_t)(match - array);
1086 }
1087 }
1088
1089 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1090 UnicodeString::lastIndexOf(const UChar *srcChars,
1091 int32_t srcStart,
1092 int32_t srcLength,
1093 int32_t start,
1094 int32_t length) const
1095 {
1096 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1097 return -1;
1098 }
1099
1100 // UnicodeString does not find empty substrings
1101 if(srcLength < 0 && srcChars[srcStart] == 0) {
1102 return -1;
1103 }
1104
1105 // get the indices within bounds
1106 pinIndices(start, length);
1107
1108 // find the last occurrence of the substring
1109 const UChar *array = getArrayStart();
1110 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1111 if(match == NULL) {
1112 return -1;
1113 } else {
1114 return (int32_t)(match - array);
1115 }
1116 }
1117
1118 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1119 UnicodeString::doLastIndexOf(UChar c,
1120 int32_t start,
1121 int32_t length) const
1122 {
1123 if(isBogus()) {
1124 return -1;
1125 }
1126
1127 // pin indices
1128 pinIndices(start, length);
1129
1130 // find the last occurrence of c
1131 const UChar *array = getArrayStart();
1132 const UChar *match = u_memrchr(array + start, c, length);
1133 if(match == NULL) {
1134 return -1;
1135 } else {
1136 return (int32_t)(match - array);
1137 }
1138 }
1139
1140 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1141 UnicodeString::doLastIndexOf(UChar32 c,
1142 int32_t start,
1143 int32_t length) const {
1144 // pin indices
1145 pinIndices(start, length);
1146
1147 // find the last occurrence of c
1148 const UChar *array = getArrayStart();
1149 const UChar *match = u_memrchr32(array + start, c, length);
1150 if(match == NULL) {
1151 return -1;
1152 } else {
1153 return (int32_t)(match - array);
1154 }
1155 }
1156
1157 //========================================
1158 // Write implementation
1159 //========================================
1160
1161 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1162 UnicodeString::findAndReplace(int32_t start,
1163 int32_t length,
1164 const UnicodeString& oldText,
1165 int32_t oldStart,
1166 int32_t oldLength,
1167 const UnicodeString& newText,
1168 int32_t newStart,
1169 int32_t newLength)
1170 {
1171 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1172 return *this;
1173 }
1174
1175 pinIndices(start, length);
1176 oldText.pinIndices(oldStart, oldLength);
1177 newText.pinIndices(newStart, newLength);
1178
1179 if(oldLength == 0) {
1180 return *this;
1181 }
1182
1183 while(length > 0 && length >= oldLength) {
1184 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1185 if(pos < 0) {
1186 // no more oldText's here: done
1187 break;
1188 } else {
1189 // we found oldText, replace it by newText and go beyond it
1190 replace(pos, oldLength, newText, newStart, newLength);
1191 length -= pos + oldLength - start;
1192 start = pos + newLength;
1193 }
1194 }
1195
1196 return *this;
1197 }
1198
1199
1200 void
setToBogus()1201 UnicodeString::setToBogus()
1202 {
1203 releaseArray();
1204
1205 fUnion.fFields.fLengthAndFlags = kIsBogus;
1206 fUnion.fFields.fArray = 0;
1207 fUnion.fFields.fCapacity = 0;
1208 }
1209
1210 // turn a bogus string into an empty one
1211 void
unBogus()1212 UnicodeString::unBogus() {
1213 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1214 setToEmpty();
1215 }
1216 }
1217
1218 const char16_t *
getTerminatedBuffer()1219 UnicodeString::getTerminatedBuffer() {
1220 if(!isWritable()) {
1221 return nullptr;
1222 }
1223 UChar *array = getArrayStart();
1224 int32_t len = length();
1225 if(len < getCapacity()) {
1226 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1227 // If len<capacity on a read-only alias, then array[len] is
1228 // either the original NUL (if constructed with (TRUE, s, length))
1229 // or one of the original string contents characters (if later truncated),
1230 // therefore we can assume that array[len] is initialized memory.
1231 if(array[len] == 0) {
1232 return array;
1233 }
1234 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1235 // kRefCounted: Do not write the NUL if the buffer is shared.
1236 // That is mostly safe, except when the length of one copy was modified
1237 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1238 // Then the NUL would be written into the middle of another copy's string.
1239
1240 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1241 // Do not test if there is a NUL already because it might be uninitialized memory.
1242 // (That would be safe, but tools like valgrind & Purify would complain.)
1243 array[len] = 0;
1244 return array;
1245 }
1246 }
1247 if(len<INT32_MAX && cloneArrayIfNeeded(len+1)) {
1248 array = getArrayStart();
1249 array[len] = 0;
1250 return array;
1251 } else {
1252 return nullptr;
1253 }
1254 }
1255
1256 // setTo() analogous to the readonly-aliasing constructor with the same signature
1257 UnicodeString &
setTo(UBool isTerminated,ConstChar16Ptr textPtr,int32_t textLength)1258 UnicodeString::setTo(UBool isTerminated,
1259 ConstChar16Ptr textPtr,
1260 int32_t textLength)
1261 {
1262 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1263 // do not modify a string that has an "open" getBuffer(minCapacity)
1264 return *this;
1265 }
1266
1267 const UChar *text = textPtr;
1268 if(text == NULL) {
1269 // treat as an empty string, do not alias
1270 releaseArray();
1271 setToEmpty();
1272 return *this;
1273 }
1274
1275 if( textLength < -1 ||
1276 (textLength == -1 && !isTerminated) ||
1277 (textLength >= 0 && isTerminated && text[textLength] != 0)
1278 ) {
1279 setToBogus();
1280 return *this;
1281 }
1282
1283 releaseArray();
1284
1285 if(textLength == -1) {
1286 // text is terminated, or else it would have failed the above test
1287 textLength = u_strlen(text);
1288 }
1289 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1290 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1291 return *this;
1292 }
1293
1294 // setTo() analogous to the writable-aliasing constructor with the same signature
1295 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1296 UnicodeString::setTo(UChar *buffer,
1297 int32_t buffLength,
1298 int32_t buffCapacity) {
1299 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1300 // do not modify a string that has an "open" getBuffer(minCapacity)
1301 return *this;
1302 }
1303
1304 if(buffer == NULL) {
1305 // treat as an empty string, do not alias
1306 releaseArray();
1307 setToEmpty();
1308 return *this;
1309 }
1310
1311 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1312 setToBogus();
1313 return *this;
1314 } else if(buffLength == -1) {
1315 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1316 const UChar *p = buffer, *limit = buffer + buffCapacity;
1317 while(p != limit && *p != 0) {
1318 ++p;
1319 }
1320 buffLength = (int32_t)(p - buffer);
1321 }
1322
1323 releaseArray();
1324
1325 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1326 setArray(buffer, buffLength, buffCapacity);
1327 return *this;
1328 }
1329
setToUTF8(StringPiece utf8)1330 UnicodeString &UnicodeString::setToUTF8(StringPiece utf8) {
1331 unBogus();
1332 int32_t length = utf8.length();
1333 int32_t capacity;
1334 // The UTF-16 string will be at most as long as the UTF-8 string.
1335 if(length <= US_STACKBUF_SIZE) {
1336 capacity = US_STACKBUF_SIZE;
1337 } else {
1338 capacity = length + 1; // +1 for the terminating NUL.
1339 }
1340 UChar *utf16 = getBuffer(capacity);
1341 int32_t length16;
1342 UErrorCode errorCode = U_ZERO_ERROR;
1343 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1344 utf8.data(), length,
1345 0xfffd, // Substitution character.
1346 NULL, // Don't care about number of substitutions.
1347 &errorCode);
1348 releaseBuffer(length16);
1349 if(U_FAILURE(errorCode)) {
1350 setToBogus();
1351 }
1352 return *this;
1353 }
1354
1355 UnicodeString&
setCharAt(int32_t offset,UChar c)1356 UnicodeString::setCharAt(int32_t offset,
1357 UChar c)
1358 {
1359 int32_t len = length();
1360 if(cloneArrayIfNeeded() && len > 0) {
1361 if(offset < 0) {
1362 offset = 0;
1363 } else if(offset >= len) {
1364 offset = len - 1;
1365 }
1366
1367 getArrayStart()[offset] = c;
1368 }
1369 return *this;
1370 }
1371
1372 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1373 UnicodeString::replace(int32_t start,
1374 int32_t _length,
1375 UChar32 srcChar) {
1376 UChar buffer[U16_MAX_LENGTH];
1377 int32_t count = 0;
1378 UBool isError = FALSE;
1379 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1380 // We test isError so that the compiler does not complain that we don't.
1381 // If isError (srcChar is not a valid code point) then count==0 which means
1382 // we remove the source segment rather than replacing it with srcChar.
1383 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1384 }
1385
1386 UnicodeString&
append(UChar32 srcChar)1387 UnicodeString::append(UChar32 srcChar) {
1388 UChar buffer[U16_MAX_LENGTH];
1389 int32_t _length = 0;
1390 UBool isError = FALSE;
1391 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1392 // We test isError so that the compiler does not complain that we don't.
1393 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1394 return isError ? *this : doAppend(buffer, 0, _length);
1395 }
1396
1397 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1398 UnicodeString::doReplace( int32_t start,
1399 int32_t length,
1400 const UnicodeString& src,
1401 int32_t srcStart,
1402 int32_t srcLength)
1403 {
1404 // pin the indices to legal values
1405 src.pinIndices(srcStart, srcLength);
1406
1407 // get the characters from src
1408 // and replace the range in ourselves with them
1409 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1410 }
1411
1412 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1413 UnicodeString::doReplace(int32_t start,
1414 int32_t length,
1415 const UChar *srcChars,
1416 int32_t srcStart,
1417 int32_t srcLength)
1418 {
1419 if(!isWritable()) {
1420 return *this;
1421 }
1422
1423 int32_t oldLength = this->length();
1424
1425 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1426 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1427 if(start == 0) {
1428 // remove prefix by adjusting the array pointer
1429 pinIndex(length);
1430 fUnion.fFields.fArray += length;
1431 fUnion.fFields.fCapacity -= length;
1432 setLength(oldLength - length);
1433 return *this;
1434 } else {
1435 pinIndex(start);
1436 if(length >= (oldLength - start)) {
1437 // remove suffix by reducing the length (like truncate())
1438 setLength(start);
1439 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1440 return *this;
1441 }
1442 }
1443 }
1444
1445 if(start == oldLength) {
1446 return doAppend(srcChars, srcStart, srcLength);
1447 }
1448
1449 if(srcChars == 0) {
1450 srcLength = 0;
1451 } else {
1452 // Perform all remaining operations relative to srcChars + srcStart.
1453 // From this point forward, do not use srcStart.
1454 srcChars += srcStart;
1455 if (srcLength < 0) {
1456 // get the srcLength if necessary
1457 srcLength = u_strlen(srcChars);
1458 }
1459 }
1460
1461 // pin the indices to legal values
1462 pinIndices(start, length);
1463
1464 // Calculate the size of the string after the replace.
1465 // Avoid int32_t overflow.
1466 int32_t newLength = oldLength - length;
1467 if(srcLength > (INT32_MAX - newLength)) {
1468 setToBogus();
1469 return *this;
1470 }
1471 newLength += srcLength;
1472
1473 // Check for insertion into ourself
1474 const UChar *oldArray = getArrayStart();
1475 if (isBufferWritable() &&
1476 oldArray < srcChars + srcLength &&
1477 srcChars < oldArray + oldLength) {
1478 // Copy into a new UnicodeString and start over
1479 UnicodeString copy(srcChars, srcLength);
1480 if (copy.isBogus()) {
1481 setToBogus();
1482 return *this;
1483 }
1484 return doReplace(start, length, copy.getArrayStart(), 0, srcLength);
1485 }
1486
1487 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1488 // therefore we need to keep the current fArray
1489 UChar oldStackBuffer[US_STACKBUF_SIZE];
1490 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1491 // copy the stack buffer contents because it will be overwritten with
1492 // fUnion.fFields values
1493 u_memcpy(oldStackBuffer, oldArray, oldLength);
1494 oldArray = oldStackBuffer;
1495 }
1496
1497 // clone our array and allocate a bigger array if needed
1498 int32_t *bufferToDelete = 0;
1499 if(!cloneArrayIfNeeded(newLength, getGrowCapacity(newLength),
1500 FALSE, &bufferToDelete)
1501 ) {
1502 return *this;
1503 }
1504
1505 // now do the replace
1506
1507 UChar *newArray = getArrayStart();
1508 if(newArray != oldArray) {
1509 // if fArray changed, then we need to copy everything except what will change
1510 us_arrayCopy(oldArray, 0, newArray, 0, start);
1511 us_arrayCopy(oldArray, start + length,
1512 newArray, start + srcLength,
1513 oldLength - (start + length));
1514 } else if(length != srcLength) {
1515 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1516 us_arrayCopy(oldArray, start + length,
1517 newArray, start + srcLength,
1518 oldLength - (start + length));
1519 }
1520
1521 // now fill in the hole with the new string
1522 us_arrayCopy(srcChars, 0, newArray, start, srcLength);
1523
1524 setLength(newLength);
1525
1526 // delayed delete in case srcChars == fArray when we started, and
1527 // to keep oldArray alive for the above operations
1528 if (bufferToDelete) {
1529 uprv_free(bufferToDelete);
1530 }
1531
1532 return *this;
1533 }
1534
1535 // Versions of doReplace() only for append() variants.
1536 // doReplace() and doAppend() optimize for different cases.
1537
1538 UnicodeString&
doAppend(const UnicodeString & src,int32_t srcStart,int32_t srcLength)1539 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1540 if(srcLength == 0) {
1541 return *this;
1542 }
1543
1544 // pin the indices to legal values
1545 src.pinIndices(srcStart, srcLength);
1546 return doAppend(src.getArrayStart(), srcStart, srcLength);
1547 }
1548
1549 UnicodeString&
doAppend(const UChar * srcChars,int32_t srcStart,int32_t srcLength)1550 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1551 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1552 return *this;
1553 }
1554
1555 // Perform all remaining operations relative to srcChars + srcStart.
1556 // From this point forward, do not use srcStart.
1557 srcChars += srcStart;
1558
1559 if(srcLength < 0) {
1560 // get the srcLength if necessary
1561 if((srcLength = u_strlen(srcChars)) == 0) {
1562 return *this;
1563 }
1564 }
1565
1566 int32_t oldLength = length();
1567 int32_t newLength = oldLength + srcLength;
1568
1569 // Check for append onto ourself
1570 const UChar* oldArray = getArrayStart();
1571 if (isBufferWritable() &&
1572 oldArray < srcChars + srcLength &&
1573 srcChars < oldArray + oldLength) {
1574 // Copy into a new UnicodeString and start over
1575 UnicodeString copy(srcChars, srcLength);
1576 if (copy.isBogus()) {
1577 setToBogus();
1578 return *this;
1579 }
1580 return doAppend(copy.getArrayStart(), 0, srcLength);
1581 }
1582
1583 // optimize append() onto a large-enough, owned string
1584 if((newLength <= getCapacity() && isBufferWritable()) ||
1585 cloneArrayIfNeeded(newLength, getGrowCapacity(newLength))) {
1586 UChar *newArray = getArrayStart();
1587 // Do not copy characters when
1588 // UChar *buffer=str.getAppendBuffer(...);
1589 // is followed by
1590 // str.append(buffer, length);
1591 // or
1592 // str.appendString(buffer, length)
1593 // or similar.
1594 if(srcChars != newArray + oldLength) {
1595 us_arrayCopy(srcChars, 0, newArray, oldLength, srcLength);
1596 }
1597 setLength(newLength);
1598 }
1599 return *this;
1600 }
1601
1602 /**
1603 * Replaceable API
1604 */
1605 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1606 UnicodeString::handleReplaceBetween(int32_t start,
1607 int32_t limit,
1608 const UnicodeString& text) {
1609 replaceBetween(start, limit, text);
1610 }
1611
1612 /**
1613 * Replaceable API
1614 */
1615 void
copy(int32_t start,int32_t limit,int32_t dest)1616 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1617 if (limit <= start) {
1618 return; // Nothing to do; avoid bogus malloc call
1619 }
1620 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1621 // Check to make sure text is not null.
1622 if (text != NULL) {
1623 extractBetween(start, limit, text, 0);
1624 insert(dest, text, 0, limit - start);
1625 uprv_free(text);
1626 }
1627 }
1628
1629 /**
1630 * Replaceable API
1631 *
1632 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1633 * so we implement this function here.
1634 */
hasMetaData() const1635 UBool Replaceable::hasMetaData() const {
1636 return TRUE;
1637 }
1638
1639 /**
1640 * Replaceable API
1641 */
hasMetaData() const1642 UBool UnicodeString::hasMetaData() const {
1643 return FALSE;
1644 }
1645
1646 UnicodeString&
doReverse(int32_t start,int32_t length)1647 UnicodeString::doReverse(int32_t start, int32_t length) {
1648 if(length <= 1 || !cloneArrayIfNeeded()) {
1649 return *this;
1650 }
1651
1652 // pin the indices to legal values
1653 pinIndices(start, length);
1654 if(length <= 1) { // pinIndices() might have shrunk the length
1655 return *this;
1656 }
1657
1658 UChar *left = getArrayStart() + start;
1659 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1660 UChar swap;
1661 UBool hasSupplementary = FALSE;
1662
1663 // Before the loop we know left<right because length>=2.
1664 do {
1665 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1666 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1667 *right-- = swap;
1668 } while(left < right);
1669 // Make sure to test the middle code unit of an odd-length string.
1670 // Redundant if the length is even.
1671 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1672
1673 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1674 if(hasSupplementary) {
1675 UChar swap2;
1676
1677 left = getArrayStart() + start;
1678 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1679 while(left < right) {
1680 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1681 *left++ = swap2;
1682 *left++ = swap;
1683 } else {
1684 ++left;
1685 }
1686 }
1687 }
1688
1689 return *this;
1690 }
1691
1692 UBool
padLeading(int32_t targetLength,UChar padChar)1693 UnicodeString::padLeading(int32_t targetLength,
1694 UChar padChar)
1695 {
1696 int32_t oldLength = length();
1697 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1698 return FALSE;
1699 } else {
1700 // move contents up by padding width
1701 UChar *array = getArrayStart();
1702 int32_t start = targetLength - oldLength;
1703 us_arrayCopy(array, 0, array, start, oldLength);
1704
1705 // fill in padding character
1706 while(--start >= 0) {
1707 array[start] = padChar;
1708 }
1709 setLength(targetLength);
1710 return TRUE;
1711 }
1712 }
1713
1714 UBool
padTrailing(int32_t targetLength,UChar padChar)1715 UnicodeString::padTrailing(int32_t targetLength,
1716 UChar padChar)
1717 {
1718 int32_t oldLength = length();
1719 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1720 return FALSE;
1721 } else {
1722 // fill in padding character
1723 UChar *array = getArrayStart();
1724 int32_t length = targetLength;
1725 while(--length >= oldLength) {
1726 array[length] = padChar;
1727 }
1728 setLength(targetLength);
1729 return TRUE;
1730 }
1731 }
1732
1733 //========================================
1734 // Hashing
1735 //========================================
1736 int32_t
doHashCode() const1737 UnicodeString::doHashCode() const
1738 {
1739 /* Delegate hash computation to uhash. This makes UnicodeString
1740 * hashing consistent with UChar* hashing. */
1741 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1742 if (hashCode == kInvalidHashCode) {
1743 hashCode = kEmptyHashCode;
1744 }
1745 return hashCode;
1746 }
1747
1748 //========================================
1749 // External Buffer
1750 //========================================
1751
1752 char16_t *
getBuffer(int32_t minCapacity)1753 UnicodeString::getBuffer(int32_t minCapacity) {
1754 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1755 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1756 setZeroLength();
1757 return getArrayStart();
1758 } else {
1759 return nullptr;
1760 }
1761 }
1762
1763 void
releaseBuffer(int32_t newLength)1764 UnicodeString::releaseBuffer(int32_t newLength) {
1765 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1766 // set the new fLength
1767 int32_t capacity=getCapacity();
1768 if(newLength==-1) {
1769 // the new length is the string length, capped by fCapacity
1770 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1771 while(p<limit && *p!=0) {
1772 ++p;
1773 }
1774 newLength=(int32_t)(p-array);
1775 } else if(newLength>capacity) {
1776 newLength=capacity;
1777 }
1778 setLength(newLength);
1779 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1780 }
1781 }
1782
1783 //========================================
1784 // Miscellaneous
1785 //========================================
1786 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1787 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1788 int32_t growCapacity,
1789 UBool doCopyArray,
1790 int32_t **pBufferToDelete,
1791 UBool forceClone) {
1792 // default parameters need to be static, therefore
1793 // the defaults are -1 to have convenience defaults
1794 if(newCapacity == -1) {
1795 newCapacity = getCapacity();
1796 }
1797
1798 // while a getBuffer(minCapacity) is "open",
1799 // prevent any modifications of the string by returning FALSE here
1800 // if the string is bogus, then only an assignment or similar can revive it
1801 if(!isWritable()) {
1802 return FALSE;
1803 }
1804
1805 /*
1806 * We need to make a copy of the array if
1807 * the buffer is read-only, or
1808 * the buffer is refCounted (shared), and refCount>1, or
1809 * the buffer is too small.
1810 * Return FALSE if memory could not be allocated.
1811 */
1812 if(forceClone ||
1813 fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
1814 (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
1815 newCapacity > getCapacity()
1816 ) {
1817 // check growCapacity for default value and use of the stack buffer
1818 if(growCapacity < 0) {
1819 growCapacity = newCapacity;
1820 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1821 growCapacity = US_STACKBUF_SIZE;
1822 }
1823
1824 // save old values
1825 UChar oldStackBuffer[US_STACKBUF_SIZE];
1826 UChar *oldArray;
1827 int32_t oldLength = length();
1828 int16_t flags = fUnion.fFields.fLengthAndFlags;
1829
1830 if(flags&kUsingStackBuffer) {
1831 U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1832 if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1833 // copy the stack buffer contents because it will be overwritten with
1834 // fUnion.fFields values
1835 us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
1836 oldArray = oldStackBuffer;
1837 } else {
1838 oldArray = NULL; // no need to copy from the stack buffer to itself
1839 }
1840 } else {
1841 oldArray = fUnion.fFields.fArray;
1842 U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1843 }
1844
1845 // allocate a new array
1846 if(allocate(growCapacity) ||
1847 (newCapacity < growCapacity && allocate(newCapacity))
1848 ) {
1849 if(doCopyArray) {
1850 // copy the contents
1851 // do not copy more than what fits - it may be smaller than before
1852 int32_t minLength = oldLength;
1853 newCapacity = getCapacity();
1854 if(newCapacity < minLength) {
1855 minLength = newCapacity;
1856 }
1857 if(oldArray != NULL) {
1858 us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1859 }
1860 setLength(minLength);
1861 } else {
1862 setZeroLength();
1863 }
1864
1865 // release the old array
1866 if(flags & kRefCounted) {
1867 // the array is refCounted; decrement and release if 0
1868 u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
1869 if(umtx_atomic_dec(pRefCount) == 0) {
1870 if(pBufferToDelete == 0) {
1871 // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
1872 // is defined as volatile. (Volatile has useful non-standard behavior
1873 // with this compiler.)
1874 uprv_free((void *)pRefCount);
1875 } else {
1876 // the caller requested to delete it himself
1877 *pBufferToDelete = (int32_t *)pRefCount;
1878 }
1879 }
1880 }
1881 } else {
1882 // not enough memory for growCapacity and not even for the smaller newCapacity
1883 // reset the old values for setToBogus() to release the array
1884 if(!(flags&kUsingStackBuffer)) {
1885 fUnion.fFields.fArray = oldArray;
1886 }
1887 fUnion.fFields.fLengthAndFlags = flags;
1888 setToBogus();
1889 return FALSE;
1890 }
1891 }
1892 return TRUE;
1893 }
1894
1895 // UnicodeStringAppendable ------------------------------------------------- ***
1896
~UnicodeStringAppendable()1897 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1898
1899 UBool
appendCodeUnit(UChar c)1900 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1901 return str.doAppend(&c, 0, 1).isWritable();
1902 }
1903
1904 UBool
appendCodePoint(UChar32 c)1905 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1906 UChar buffer[U16_MAX_LENGTH];
1907 int32_t cLength = 0;
1908 UBool isError = FALSE;
1909 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1910 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1911 }
1912
1913 UBool
appendString(const UChar * s,int32_t length)1914 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1915 return str.doAppend(s, 0, length).isWritable();
1916 }
1917
1918 UBool
reserveAppendCapacity(int32_t appendCapacity)1919 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1920 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1921 }
1922
1923 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1924 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1925 int32_t desiredCapacityHint,
1926 UChar *scratch, int32_t scratchCapacity,
1927 int32_t *resultCapacity) {
1928 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1929 *resultCapacity = 0;
1930 return NULL;
1931 }
1932 int32_t oldLength = str.length();
1933 if(minCapacity <= (kMaxCapacity - oldLength) &&
1934 desiredCapacityHint <= (kMaxCapacity - oldLength) &&
1935 str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1936 *resultCapacity = str.getCapacity() - oldLength;
1937 return str.getArrayStart() + oldLength;
1938 }
1939 *resultCapacity = scratchCapacity;
1940 return scratch;
1941 }
1942
1943 U_NAMESPACE_END
1944
1945 U_NAMESPACE_USE
1946
1947 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1948 uhash_hashUnicodeString(const UElement key) {
1949 const UnicodeString *str = (const UnicodeString*) key.pointer;
1950 return (str == NULL) ? 0 : str->hashCode();
1951 }
1952
1953 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1954 // does not depend on hashtable code.
1955 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1956 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1957 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1958 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1959 if (str1 == str2) {
1960 return TRUE;
1961 }
1962 if (str1 == NULL || str2 == NULL) {
1963 return FALSE;
1964 }
1965 return *str1 == *str2;
1966 }
1967
1968 #ifdef U_STATIC_IMPLEMENTATION
1969 /*
1970 This should never be called. It is defined here to make sure that the
1971 virtual vector deleting destructor is defined within unistr.cpp.
1972 The vector deleting destructor is already a part of UObject,
1973 but defining it here makes sure that it is included with this object file.
1974 This makes sure that static library dependencies are kept to a minimum.
1975 */
uprv_UnicodeStringDummy(void)1976 static void uprv_UnicodeStringDummy(void) {
1977 delete [] (new UnicodeString[2]);
1978 }
1979 #endif
1980