1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2015, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
16 * Replaceable.
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34
35 #if 0
36
37 #include <iostream>
38 using namespace std;
39
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43 const char *name)
44 {
45 UChar c;
46 cout << name << ":|";
47 for(int i = 0; i < s.length(); ++i) {
48 c = s[i];
49 if(c>= 0x007E || c < 0x0020)
50 cout << "[0x" << hex << s[i] << "]";
51 else
52 cout << (char) s[i];
53 }
54 cout << '|' << endl;
55 }
56
57 void
58 print(const UChar *s,
59 int32_t len,
60 const char *name)
61 {
62 UChar c;
63 cout << name << ":|";
64 for(int i = 0; i < len; ++i) {
65 c = s[i];
66 if(c>= 0x007E || c < 0x0020)
67 cout << "[0x" << hex << s[i] << "]";
68 else
69 cout << (char) s[i];
70 }
71 cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75
76 // Local function definitions for now
77
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82 UChar *dst, int32_t dstStart, int32_t count)
83 {
84 if(count>0) {
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86 }
87 }
88
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93 return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96
97 U_NAMESPACE_BEGIN
98
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
103
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108 return
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110 append(s1).
111 append(s2);
112 }
113
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
118
119 void
addRef()120 UnicodeString::addRef() {
121 umtx_atomic_inc((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
122 }
123
124 int32_t
removeRef()125 UnicodeString::removeRef() {
126 return umtx_atomic_dec((u_atomic_int32_t *)fUnion.fFields.fArray - 1);
127 }
128
129 int32_t
refCount() const130 UnicodeString::refCount() const {
131 return umtx_loadAcquire(*((u_atomic_int32_t *)fUnion.fFields.fArray - 1));
132 }
133
134 void
releaseArray()135 UnicodeString::releaseArray() {
136 if((fUnion.fFields.fLengthAndFlags & kRefCounted) && removeRef() == 0) {
137 uprv_free((int32_t *)fUnion.fFields.fArray - 1);
138 }
139 }
140
141
142
143 //========================================
144 // Constructors
145 //========================================
146
147 // The default constructor is inline in unistr.h.
148
UnicodeString(int32_t capacity,UChar32 c,int32_t count)149 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count) {
150 fUnion.fFields.fLengthAndFlags = 0;
151 if(count <= 0 || (uint32_t)c > 0x10ffff) {
152 // just allocate and do not do anything else
153 allocate(capacity);
154 } else {
155 // count > 0, allocate and fill the new string with count c's
156 int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
157 if(capacity < length) {
158 capacity = length;
159 }
160 if(allocate(capacity)) {
161 UChar *array = getArrayStart();
162 int32_t i = 0;
163
164 // fill the new string with c
165 if(unitCount == 1) {
166 // fill with length UChars
167 while(i < length) {
168 array[i++] = (UChar)c;
169 }
170 } else {
171 // get the code units for c
172 UChar units[U16_MAX_LENGTH];
173 U16_APPEND_UNSAFE(units, i, c);
174
175 // now it must be i==unitCount
176 i = 0;
177
178 // for Unicode, unitCount can only be 1, 2, 3, or 4
179 // 1 is handled above
180 while(i < length) {
181 int32_t unitIdx = 0;
182 while(unitIdx < unitCount) {
183 array[i++]=units[unitIdx++];
184 }
185 }
186 }
187 }
188 setLength(length);
189 }
190 }
191
UnicodeString(UChar ch)192 UnicodeString::UnicodeString(UChar ch) {
193 fUnion.fFields.fLengthAndFlags = kLength1 | kShortString;
194 fUnion.fStackFields.fBuffer[0] = ch;
195 }
196
UnicodeString(UChar32 ch)197 UnicodeString::UnicodeString(UChar32 ch) {
198 fUnion.fFields.fLengthAndFlags = kShortString;
199 int32_t i = 0;
200 UBool isError = FALSE;
201 U16_APPEND(fUnion.fStackFields.fBuffer, i, US_STACKBUF_SIZE, ch, isError);
202 // We test isError so that the compiler does not complain that we don't.
203 // If isError then i==0 which is what we want anyway.
204 if(!isError) {
205 setShortLength(i);
206 }
207 }
208
UnicodeString(const UChar * text)209 UnicodeString::UnicodeString(const UChar *text) {
210 fUnion.fFields.fLengthAndFlags = kShortString;
211 doAppend(text, 0, -1);
212 }
213
UnicodeString(const UChar * text,int32_t textLength)214 UnicodeString::UnicodeString(const UChar *text,
215 int32_t textLength) {
216 fUnion.fFields.fLengthAndFlags = kShortString;
217 doAppend(text, 0, textLength);
218 }
219
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)220 UnicodeString::UnicodeString(UBool isTerminated,
221 const UChar *text,
222 int32_t textLength) {
223 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
224 if(text == NULL) {
225 // treat as an empty string, do not alias
226 setToEmpty();
227 } else if(textLength < -1 ||
228 (textLength == -1 && !isTerminated) ||
229 (textLength >= 0 && isTerminated && text[textLength] != 0)
230 ) {
231 setToBogus();
232 } else {
233 if(textLength == -1) {
234 // text is terminated, or else it would have failed the above test
235 textLength = u_strlen(text);
236 }
237 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
238 }
239 }
240
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)241 UnicodeString::UnicodeString(UChar *buff,
242 int32_t buffLength,
243 int32_t buffCapacity) {
244 fUnion.fFields.fLengthAndFlags = kWritableAlias;
245 if(buff == NULL) {
246 // treat as an empty string, do not alias
247 setToEmpty();
248 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
249 setToBogus();
250 } else {
251 if(buffLength == -1) {
252 // fLength = u_strlen(buff); but do not look beyond buffCapacity
253 const UChar *p = buff, *limit = buff + buffCapacity;
254 while(p != limit && *p != 0) {
255 ++p;
256 }
257 buffLength = (int32_t)(p - buff);
258 }
259 setArray(buff, buffLength, buffCapacity);
260 }
261 }
262
UnicodeString(const char * src,int32_t length,EInvariant)263 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant) {
264 fUnion.fFields.fLengthAndFlags = kShortString;
265 if(src==NULL) {
266 // treat as an empty string
267 } else {
268 if(length<0) {
269 length=(int32_t)uprv_strlen(src);
270 }
271 if(cloneArrayIfNeeded(length, length, FALSE)) {
272 u_charsToUChars(src, getArrayStart(), length);
273 setLength(length);
274 } else {
275 setToBogus();
276 }
277 }
278 }
279
280 #if U_CHARSET_IS_UTF8
281
UnicodeString(const char * codepageData)282 UnicodeString::UnicodeString(const char *codepageData) {
283 fUnion.fFields.fLengthAndFlags = kShortString;
284 if(codepageData != 0) {
285 setToUTF8(codepageData);
286 }
287 }
288
UnicodeString(const char * codepageData,int32_t dataLength)289 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength) {
290 fUnion.fFields.fLengthAndFlags = kShortString;
291 // if there's nothing to convert, do nothing
292 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
293 return;
294 }
295 if(dataLength == -1) {
296 dataLength = (int32_t)uprv_strlen(codepageData);
297 }
298 setToUTF8(StringPiece(codepageData, dataLength));
299 }
300
301 // else see unistr_cnv.cpp
302 #endif
303
// Copy constructor: starts as an empty short string, then takes over
// the contents of that via copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that) {
  fUnion.fFields.fLengthAndFlags = kShortString;  // valid state for copyFrom()
  copyFrom(that);
}
308
309 #if U_HAVE_RVALUE_REFERENCES
UnicodeString(UnicodeString && src)310 UnicodeString::UnicodeString(UnicodeString &&src) U_NOEXCEPT {
311 fUnion.fFields.fLengthAndFlags = kShortString;
312 moveFrom(src);
313 }
314 #endif
315
// Constructs a string from the part of that selected by srcStart;
// the selection itself is done by setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart) {
  fUnion.fFields.fLengthAndFlags = kShortString;  // start out empty
  setTo(that, srcStart);
}
321
// Constructs a string from the part of that selected by srcStart and
// srcLength; the selection itself is done by setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength) {
  fUnion.fFields.fLengthAndFlags = kShortString;  // start out empty
  setTo(that, srcStart, srcLength);
}
328
329 // Replaceable base class clone() default implementation, does not clone
330 Replaceable *
clone() const331 Replaceable::clone() const {
332 return NULL;
333 }
334
335 // UnicodeString overrides clone() with a real implementation
336 Replaceable *
clone() const337 UnicodeString::clone() const {
338 return new UnicodeString(*this);
339 }
340
341 //========================================
342 // array allocation
343 //========================================
344
345 UBool
allocate(int32_t capacity)346 UnicodeString::allocate(int32_t capacity) {
347 if(capacity <= US_STACKBUF_SIZE) {
348 fUnion.fFields.fLengthAndFlags = kShortString;
349 } else {
350 // count bytes for the refCounter and the string capacity, and
351 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
352 // to be safely aligned for the refCount
353 // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
354 int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
355 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
356 if(array != 0) {
357 // set initial refCount and point behind the refCount
358 *array++ = 1;
359
360 // have fArray point to the first UChar
361 fUnion.fFields.fArray = (UChar *)array;
362 fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
363 fUnion.fFields.fLengthAndFlags = kLongString;
364 } else {
365 fUnion.fFields.fLengthAndFlags = kIsBogus;
366 fUnion.fFields.fArray = 0;
367 fUnion.fFields.fCapacity = 0;
368 return FALSE;
369 }
370 }
371 return TRUE;
372 }
373
374 //========================================
375 // Destructor
376 //========================================
377
378 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
379 static u_atomic_int32_t finalLengthCounts[0x400]; // UnicodeString::kMaxShortLength+1
380 static u_atomic_int32_t beyondCount(0);
381
unistr_printLengths()382 U_CAPI void unistr_printLengths() {
383 int32_t i;
384 for(i = 0; i <= 59; ++i) {
385 printf("%2d, %9d\n", i, (int32_t)finalLengthCounts[i]);
386 }
387 int32_t beyond = beyondCount;
388 for(; i < UPRV_LENGTHOF(finalLengthCounts); ++i) {
389 beyond += finalLengthCounts[i];
390 }
391 printf(">59, %9d\n", beyond);
392 }
393 #endif
394
~UnicodeString()395 UnicodeString::~UnicodeString()
396 {
397 #ifdef UNISTR_COUNT_FINAL_STRING_LENGTHS
398 // Count lengths of strings at the end of their lifetime.
399 // Useful for discussion of a desirable stack buffer size.
400 // Count the contents length, not the optional NUL terminator nor further capacity.
401 // Ignore open-buffer strings and strings which alias external storage.
402 if((fUnion.fFields.fLengthAndFlags&(kOpenGetBuffer|kReadonlyAlias|kWritableAlias)) == 0) {
403 if(hasShortLength()) {
404 umtx_atomic_inc(finalLengthCounts + getShortLength());
405 } else {
406 umtx_atomic_inc(&beyondCount);
407 }
408 }
409 #endif
410
411 releaseArray();
412 }
413
414 //========================================
415 // Factory methods
416 //========================================
417
fromUTF8(const StringPiece & utf8)418 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
419 UnicodeString result;
420 result.setToUTF8(utf8);
421 return result;
422 }
423
fromUTF32(const UChar32 * utf32,int32_t length)424 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
425 UnicodeString result;
426 int32_t capacity;
427 // Most UTF-32 strings will be BMP-only and result in a same-length
428 // UTF-16 string. We overestimate the capacity just slightly,
429 // just in case there are a few supplementary characters.
430 if(length <= US_STACKBUF_SIZE) {
431 capacity = US_STACKBUF_SIZE;
432 } else {
433 capacity = length + (length >> 4) + 4;
434 }
435 do {
436 UChar *utf16 = result.getBuffer(capacity);
437 int32_t length16;
438 UErrorCode errorCode = U_ZERO_ERROR;
439 u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
440 utf32, length,
441 0xfffd, // Substitution character.
442 NULL, // Don't care about number of substitutions.
443 &errorCode);
444 result.releaseBuffer(length16);
445 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
446 capacity = length16 + 1; // +1 for the terminating NUL.
447 continue;
448 } else if(U_FAILURE(errorCode)) {
449 result.setToBogus();
450 }
451 break;
452 } while(TRUE);
453 return result;
454 }
455
456 //========================================
457 // Assignment
458 //========================================
459
460 UnicodeString &
operator =(const UnicodeString & src)461 UnicodeString::operator=(const UnicodeString &src) {
462 return copyFrom(src);
463 }
464
465 UnicodeString &
fastCopyFrom(const UnicodeString & src)466 UnicodeString::fastCopyFrom(const UnicodeString &src) {
467 return copyFrom(src, TRUE);
468 }
469
470 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)471 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
472 // if assigning to ourselves, do nothing
473 if(this == &src) {
474 return *this;
475 }
476
477 // is the right side bogus?
478 if(src.isBogus()) {
479 setToBogus();
480 return *this;
481 }
482
483 // delete the current contents
484 releaseArray();
485
486 if(src.isEmpty()) {
487 // empty string - use the stack buffer
488 setToEmpty();
489 return *this;
490 }
491
492 // fLength>0 and not an "open" src.getBuffer(minCapacity)
493 fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
494 switch(src.fUnion.fFields.fLengthAndFlags & kAllStorageFlags) {
495 case kShortString:
496 // short string using the stack buffer, do the same
497 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
498 getShortLength() * U_SIZEOF_UCHAR);
499 break;
500 case kLongString:
501 // src uses a refCounted string buffer, use that buffer with refCount
502 // src is const, use a cast - we don't actually change it
503 ((UnicodeString &)src).addRef();
504 // copy all fields, share the reference-counted buffer
505 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
506 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
507 if(!hasShortLength()) {
508 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
509 }
510 break;
511 case kReadonlyAlias:
512 if(fastCopy) {
513 // src is a readonly alias, do the same
514 // -> maintain the readonly alias as such
515 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
516 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
517 if(!hasShortLength()) {
518 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
519 }
520 break;
521 }
522 // else if(!fastCopy) fall through to case kWritableAlias
523 // -> allocate a new buffer and copy the contents
524 case kWritableAlias: {
525 // src is a writable alias; we make a copy of that instead
526 int32_t srcLength = src.length();
527 if(allocate(srcLength)) {
528 uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
529 setLength(srcLength);
530 break;
531 }
532 // if there is not enough memory, then fall through to setting to bogus
533 }
534 default:
535 // if src is bogus, set ourselves to bogus
536 // do not call setToBogus() here because fArray and flags are not consistent here
537 fUnion.fFields.fLengthAndFlags = kIsBogus;
538 fUnion.fFields.fArray = 0;
539 fUnion.fFields.fCapacity = 0;
540 break;
541 }
542
543 return *this;
544 }
545
moveFrom(UnicodeString & src)546 UnicodeString &UnicodeString::moveFrom(UnicodeString &src) U_NOEXCEPT {
547 // No explicit check for self move assignment, consistent with standard library.
548 // Self move assignment causes no crash nor leak but might make the object bogus.
549 releaseArray();
550 copyFieldsFrom(src, TRUE);
551 return *this;
552 }
553
554 // Same as moveFrom() except without memory management.
copyFieldsFrom(UnicodeString & src,UBool setSrcToBogus)555 void UnicodeString::copyFieldsFrom(UnicodeString &src, UBool setSrcToBogus) U_NOEXCEPT {
556 int16_t lengthAndFlags = fUnion.fFields.fLengthAndFlags = src.fUnion.fFields.fLengthAndFlags;
557 if(lengthAndFlags & kUsingStackBuffer) {
558 // Short string using the stack buffer, copy the contents.
559 // Check for self assignment to prevent "overlap in memcpy" warnings,
560 // although it should be harmless to copy a buffer to itself exactly.
561 if(this != &src) {
562 uprv_memcpy(fUnion.fStackFields.fBuffer, src.fUnion.fStackFields.fBuffer,
563 getShortLength() * U_SIZEOF_UCHAR);
564 }
565 } else {
566 // In all other cases, copy all fields.
567 fUnion.fFields.fArray = src.fUnion.fFields.fArray;
568 fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
569 if(!hasShortLength()) {
570 fUnion.fFields.fLength = src.fUnion.fFields.fLength;
571 }
572 if(setSrcToBogus) {
573 // Set src to bogus without releasing any memory.
574 src.fUnion.fFields.fLengthAndFlags = kIsBogus;
575 src.fUnion.fFields.fArray = NULL;
576 src.fUnion.fFields.fCapacity = 0;
577 }
578 }
579 }
580
swap(UnicodeString & other)581 void UnicodeString::swap(UnicodeString &other) U_NOEXCEPT {
582 UnicodeString temp; // Empty short string: Known not to need releaseArray().
583 // Copy fields without resetting source values in between.
584 temp.copyFieldsFrom(*this, FALSE);
585 this->copyFieldsFrom(other, FALSE);
586 other.copyFieldsFrom(temp, FALSE);
587 // Set temp to an empty string so that other's memory is not released twice.
588 temp.fUnion.fFields.fLengthAndFlags = kShortString;
589 }
590
591 //========================================
592 // Miscellaneous operations
593 //========================================
594
unescape() const595 UnicodeString UnicodeString::unescape() const {
596 UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
597 if (result.isBogus()) {
598 return result;
599 }
600 const UChar *array = getBuffer();
601 int32_t len = length();
602 int32_t prev = 0;
603 for (int32_t i=0;;) {
604 if (i == len) {
605 result.append(array, prev, len - prev);
606 break;
607 }
608 if (array[i++] == 0x5C /*'\\'*/) {
609 result.append(array, prev, (i - 1) - prev);
610 UChar32 c = unescapeAt(i); // advances i
611 if (c < 0) {
612 result.remove(); // return empty string
613 break; // invalid escape sequence
614 }
615 result.append(c);
616 prev = i;
617 }
618 }
619 return result;
620 }
621
unescapeAt(int32_t & offset) const622 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
623 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
624 }
625
626 //========================================
627 // Read-only implementation
628 //========================================
629 UBool
doEquals(const UnicodeString & text,int32_t len) const630 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
631 // Requires: this & text not bogus and have same lengths.
632 // Byte-wise comparison works for equality regardless of endianness.
633 return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
634 }
635
636 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const637 UnicodeString::doCompare( int32_t start,
638 int32_t length,
639 const UChar *srcChars,
640 int32_t srcStart,
641 int32_t srcLength) const
642 {
643 // compare illegal string values
644 if(isBogus()) {
645 return -1;
646 }
647
648 // pin indices to legal values
649 pinIndices(start, length);
650
651 if(srcChars == NULL) {
652 // treat const UChar *srcChars==NULL as an empty string
653 return length == 0 ? 0 : 1;
654 }
655
656 // get the correct pointer
657 const UChar *chars = getArrayStart();
658
659 chars += start;
660 srcChars += srcStart;
661
662 int32_t minLength;
663 int8_t lengthResult;
664
665 // get the srcLength if necessary
666 if(srcLength < 0) {
667 srcLength = u_strlen(srcChars + srcStart);
668 }
669
670 // are we comparing different lengths?
671 if(length != srcLength) {
672 if(length < srcLength) {
673 minLength = length;
674 lengthResult = -1;
675 } else {
676 minLength = srcLength;
677 lengthResult = 1;
678 }
679 } else {
680 minLength = length;
681 lengthResult = 0;
682 }
683
684 /*
685 * note that uprv_memcmp() returns an int but we return an int8_t;
686 * we need to take care not to truncate the result -
687 * one way to do this is to right-shift the value to
688 * move the sign bit into the lower 8 bits and making sure that this
689 * does not become 0 itself
690 */
691
692 if(minLength > 0 && chars != srcChars) {
693 int32_t result;
694
695 # if U_IS_BIG_ENDIAN
696 // big-endian: byte comparison works
697 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
698 if(result != 0) {
699 return (int8_t)(result >> 15 | 1);
700 }
701 # else
702 // little-endian: compare UChar units
703 do {
704 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
705 if(result != 0) {
706 return (int8_t)(result >> 15 | 1);
707 }
708 } while(--minLength > 0);
709 # endif
710 }
711 return lengthResult;
712 }
713
714 /* String compare in code point order - doCompare() compares in code unit order. */
715 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const716 UnicodeString::doCompareCodePointOrder(int32_t start,
717 int32_t length,
718 const UChar *srcChars,
719 int32_t srcStart,
720 int32_t srcLength) const
721 {
722 // compare illegal string values
723 // treat const UChar *srcChars==NULL as an empty string
724 if(isBogus()) {
725 return -1;
726 }
727
728 // pin indices to legal values
729 pinIndices(start, length);
730
731 if(srcChars == NULL) {
732 srcStart = srcLength = 0;
733 }
734
735 int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
736 /* translate the 32-bit result into an 8-bit one */
737 if(diff!=0) {
738 return (int8_t)(diff >> 15 | 1);
739 } else {
740 return 0;
741 }
742 }
743
744 int32_t
getLength() const745 UnicodeString::getLength() const {
746 return length();
747 }
748
749 UChar
getCharAt(int32_t offset) const750 UnicodeString::getCharAt(int32_t offset) const {
751 return charAt(offset);
752 }
753
754 UChar32
getChar32At(int32_t offset) const755 UnicodeString::getChar32At(int32_t offset) const {
756 return char32At(offset);
757 }
758
759 UChar32
char32At(int32_t offset) const760 UnicodeString::char32At(int32_t offset) const
761 {
762 int32_t len = length();
763 if((uint32_t)offset < (uint32_t)len) {
764 const UChar *array = getArrayStart();
765 UChar32 c;
766 U16_GET(array, 0, offset, len, c);
767 return c;
768 } else {
769 return kInvalidUChar;
770 }
771 }
772
773 int32_t
getChar32Start(int32_t offset) const774 UnicodeString::getChar32Start(int32_t offset) const {
775 if((uint32_t)offset < (uint32_t)length()) {
776 const UChar *array = getArrayStart();
777 U16_SET_CP_START(array, 0, offset);
778 return offset;
779 } else {
780 return 0;
781 }
782 }
783
784 int32_t
getChar32Limit(int32_t offset) const785 UnicodeString::getChar32Limit(int32_t offset) const {
786 int32_t len = length();
787 if((uint32_t)offset < (uint32_t)len) {
788 const UChar *array = getArrayStart();
789 U16_SET_CP_LIMIT(array, 0, offset, len);
790 return offset;
791 } else {
792 return len;
793 }
794 }
795
796 int32_t
countChar32(int32_t start,int32_t length) const797 UnicodeString::countChar32(int32_t start, int32_t length) const {
798 pinIndices(start, length);
799 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
800 return u_countChar32(getArrayStart()+start, length);
801 }
802
803 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const804 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
805 pinIndices(start, length);
806 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
807 return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
808 }
809
810 int32_t
moveIndex32(int32_t index,int32_t delta) const811 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
812 // pin index
813 int32_t len = length();
814 if(index<0) {
815 index=0;
816 } else if(index>len) {
817 index=len;
818 }
819
820 const UChar *array = getArrayStart();
821 if(delta>0) {
822 U16_FWD_N(array, index, len, delta);
823 } else {
824 U16_BACK_N(array, 0, index, -delta);
825 }
826
827 return index;
828 }
829
830 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const831 UnicodeString::doExtract(int32_t start,
832 int32_t length,
833 UChar *dst,
834 int32_t dstStart) const
835 {
836 // pin indices to legal values
837 pinIndices(start, length);
838
839 // do not copy anything if we alias dst itself
840 const UChar *array = getArrayStart();
841 if(array + start != dst + dstStart) {
842 us_arrayCopy(array, start, dst, dstStart, length);
843 }
844 }
845
846 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const847 UnicodeString::extract(UChar *dest, int32_t destCapacity,
848 UErrorCode &errorCode) const {
849 int32_t len = length();
850 if(U_SUCCESS(errorCode)) {
851 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
852 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
853 } else {
854 const UChar *array = getArrayStart();
855 if(len>0 && len<=destCapacity && array!=dest) {
856 uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
857 }
858 return u_terminateUChars(dest, destCapacity, len, &errorCode);
859 }
860 }
861
862 return len;
863 }
864
865 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const866 UnicodeString::extract(int32_t start,
867 int32_t length,
868 char *target,
869 int32_t targetCapacity,
870 enum EInvariant) const
871 {
872 // if the arguments are illegal, then do nothing
873 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
874 return 0;
875 }
876
877 // pin the indices to legal values
878 pinIndices(start, length);
879
880 if(length <= targetCapacity) {
881 u_UCharsToChars(getArrayStart() + start, target, length);
882 }
883 UErrorCode status = U_ZERO_ERROR;
884 return u_terminateChars(target, targetCapacity, length, &status);
885 }
886
887 UnicodeString
tempSubString(int32_t start,int32_t len) const888 UnicodeString::tempSubString(int32_t start, int32_t len) const {
889 pinIndices(start, len);
890 const UChar *array = getBuffer(); // not getArrayStart() to check kIsBogus & kOpenGetBuffer
891 if(array==NULL) {
892 array=fUnion.fStackFields.fBuffer; // anything not NULL because that would make an empty string
893 len=-2; // bogus result string
894 }
895 return UnicodeString(FALSE, array + start, len);
896 }
897
898 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const899 UnicodeString::toUTF8(int32_t start, int32_t len,
900 char *target, int32_t capacity) const {
901 pinIndices(start, len);
902 int32_t length8;
903 UErrorCode errorCode = U_ZERO_ERROR;
904 u_strToUTF8WithSub(target, capacity, &length8,
905 getBuffer() + start, len,
906 0xFFFD, // Standard substitution character.
907 NULL, // Don't care about number of substitutions.
908 &errorCode);
909 return length8;
910 }
911
912 #if U_CHARSET_IS_UTF8
913
914 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const915 UnicodeString::extract(int32_t start, int32_t len,
916 char *target, uint32_t dstSize) const {
917 // if the arguments are illegal, then do nothing
918 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
919 return 0;
920 }
921 return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
922 }
923
924 // else see unistr_cnv.cpp
925 #endif
926
927 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const928 UnicodeString::extractBetween(int32_t start,
929 int32_t limit,
930 UnicodeString& target) const {
931 pinIndex(start);
932 pinIndex(limit);
933 doExtract(start, limit - start, target);
934 }
935
936 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
937 // as many bytes as the source has UChars.
938 // The "worst cases" are writing systems like Indic, Thai and CJK with
939 // 3:1 bytes:UChars.
940 void
toUTF8(ByteSink & sink) const941 UnicodeString::toUTF8(ByteSink &sink) const {
942 int32_t length16 = length();
943 if(length16 != 0) {
944 char stackBuffer[1024];
945 int32_t capacity = (int32_t)sizeof(stackBuffer);
946 UBool utf8IsOwned = FALSE;
947 char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
948 3*length16,
949 stackBuffer, capacity,
950 &capacity);
951 int32_t length8 = 0;
952 UErrorCode errorCode = U_ZERO_ERROR;
953 u_strToUTF8WithSub(utf8, capacity, &length8,
954 getBuffer(), length16,
955 0xFFFD, // Standard substitution character.
956 NULL, // Don't care about number of substitutions.
957 &errorCode);
958 if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
959 utf8 = (char *)uprv_malloc(length8);
960 if(utf8 != NULL) {
961 utf8IsOwned = TRUE;
962 errorCode = U_ZERO_ERROR;
963 u_strToUTF8WithSub(utf8, length8, &length8,
964 getBuffer(), length16,
965 0xFFFD, // Standard substitution character.
966 NULL, // Don't care about number of substitutions.
967 &errorCode);
968 } else {
969 errorCode = U_MEMORY_ALLOCATION_ERROR;
970 }
971 }
972 if(U_SUCCESS(errorCode)) {
973 sink.Append(utf8, length8);
974 sink.Flush();
975 }
976 if(utf8IsOwned) {
977 uprv_free(utf8);
978 }
979 }
980 }
981
982 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const983 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
984 int32_t length32=0;
985 if(U_SUCCESS(errorCode)) {
986 // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
987 u_strToUTF32WithSub(utf32, capacity, &length32,
988 getBuffer(), length(),
989 0xfffd, // Substitution character.
990 NULL, // Don't care about number of substitutions.
991 &errorCode);
992 }
993 return length32;
994 }
995
996 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const997 UnicodeString::indexOf(const UChar *srcChars,
998 int32_t srcStart,
999 int32_t srcLength,
1000 int32_t start,
1001 int32_t length) const
1002 {
1003 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1004 return -1;
1005 }
1006
1007 // UnicodeString does not find empty substrings
1008 if(srcLength < 0 && srcChars[srcStart] == 0) {
1009 return -1;
1010 }
1011
1012 // get the indices within bounds
1013 pinIndices(start, length);
1014
1015 // find the first occurrence of the substring
1016 const UChar *array = getArrayStart();
1017 const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
1018 if(match == NULL) {
1019 return -1;
1020 } else {
1021 return (int32_t)(match - array);
1022 }
1023 }
1024
1025 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const1026 UnicodeString::doIndexOf(UChar c,
1027 int32_t start,
1028 int32_t length) const
1029 {
1030 // pin indices
1031 pinIndices(start, length);
1032
1033 // find the first occurrence of c
1034 const UChar *array = getArrayStart();
1035 const UChar *match = u_memchr(array + start, c, length);
1036 if(match == NULL) {
1037 return -1;
1038 } else {
1039 return (int32_t)(match - array);
1040 }
1041 }
1042
1043 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const1044 UnicodeString::doIndexOf(UChar32 c,
1045 int32_t start,
1046 int32_t length) const {
1047 // pin indices
1048 pinIndices(start, length);
1049
1050 // find the first occurrence of c
1051 const UChar *array = getArrayStart();
1052 const UChar *match = u_memchr32(array + start, c, length);
1053 if(match == NULL) {
1054 return -1;
1055 } else {
1056 return (int32_t)(match - array);
1057 }
1058 }
1059
1060 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1061 UnicodeString::lastIndexOf(const UChar *srcChars,
1062 int32_t srcStart,
1063 int32_t srcLength,
1064 int32_t start,
1065 int32_t length) const
1066 {
1067 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1068 return -1;
1069 }
1070
1071 // UnicodeString does not find empty substrings
1072 if(srcLength < 0 && srcChars[srcStart] == 0) {
1073 return -1;
1074 }
1075
1076 // get the indices within bounds
1077 pinIndices(start, length);
1078
1079 // find the last occurrence of the substring
1080 const UChar *array = getArrayStart();
1081 const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1082 if(match == NULL) {
1083 return -1;
1084 } else {
1085 return (int32_t)(match - array);
1086 }
1087 }
1088
1089 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1090 UnicodeString::doLastIndexOf(UChar c,
1091 int32_t start,
1092 int32_t length) const
1093 {
1094 if(isBogus()) {
1095 return -1;
1096 }
1097
1098 // pin indices
1099 pinIndices(start, length);
1100
1101 // find the last occurrence of c
1102 const UChar *array = getArrayStart();
1103 const UChar *match = u_memrchr(array + start, c, length);
1104 if(match == NULL) {
1105 return -1;
1106 } else {
1107 return (int32_t)(match - array);
1108 }
1109 }
1110
1111 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1112 UnicodeString::doLastIndexOf(UChar32 c,
1113 int32_t start,
1114 int32_t length) const {
1115 // pin indices
1116 pinIndices(start, length);
1117
1118 // find the last occurrence of c
1119 const UChar *array = getArrayStart();
1120 const UChar *match = u_memrchr32(array + start, c, length);
1121 if(match == NULL) {
1122 return -1;
1123 } else {
1124 return (int32_t)(match - array);
1125 }
1126 }
1127
1128 //========================================
1129 // Write implementation
1130 //========================================
1131
1132 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1133 UnicodeString::findAndReplace(int32_t start,
1134 int32_t length,
1135 const UnicodeString& oldText,
1136 int32_t oldStart,
1137 int32_t oldLength,
1138 const UnicodeString& newText,
1139 int32_t newStart,
1140 int32_t newLength)
1141 {
1142 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1143 return *this;
1144 }
1145
1146 pinIndices(start, length);
1147 oldText.pinIndices(oldStart, oldLength);
1148 newText.pinIndices(newStart, newLength);
1149
1150 if(oldLength == 0) {
1151 return *this;
1152 }
1153
1154 while(length > 0 && length >= oldLength) {
1155 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1156 if(pos < 0) {
1157 // no more oldText's here: done
1158 break;
1159 } else {
1160 // we found oldText, replace it by newText and go beyond it
1161 replace(pos, oldLength, newText, newStart, newLength);
1162 length -= pos + oldLength - start;
1163 start = pos + newLength;
1164 }
1165 }
1166
1167 return *this;
1168 }
1169
1170
1171 void
setToBogus()1172 UnicodeString::setToBogus()
1173 {
1174 releaseArray();
1175
1176 fUnion.fFields.fLengthAndFlags = kIsBogus;
1177 fUnion.fFields.fArray = 0;
1178 fUnion.fFields.fCapacity = 0;
1179 }
1180
1181 // turn a bogus string into an empty one
1182 void
unBogus()1183 UnicodeString::unBogus() {
1184 if(fUnion.fFields.fLengthAndFlags & kIsBogus) {
1185 setToEmpty();
1186 }
1187 }
1188
1189 const UChar *
getTerminatedBuffer()1190 UnicodeString::getTerminatedBuffer() {
1191 if(!isWritable()) {
1192 return 0;
1193 }
1194 UChar *array = getArrayStart();
1195 int32_t len = length();
1196 if(len < getCapacity()) {
1197 if(fUnion.fFields.fLengthAndFlags & kBufferIsReadonly) {
1198 // If len<capacity on a read-only alias, then array[len] is
1199 // either the original NUL (if constructed with (TRUE, s, length))
1200 // or one of the original string contents characters (if later truncated),
1201 // therefore we can assume that array[len] is initialized memory.
1202 if(array[len] == 0) {
1203 return array;
1204 }
1205 } else if(((fUnion.fFields.fLengthAndFlags & kRefCounted) == 0 || refCount() == 1)) {
1206 // kRefCounted: Do not write the NUL if the buffer is shared.
1207 // That is mostly safe, except when the length of one copy was modified
1208 // without copy-on-write, e.g., via truncate(newLength) or remove(void).
1209 // Then the NUL would be written into the middle of another copy's string.
1210
1211 // Otherwise, the buffer is fully writable and it is anyway safe to write the NUL.
1212 // Do not test if there is a NUL already because it might be uninitialized memory.
1213 // (That would be safe, but tools like valgrind & Purify would complain.)
1214 array[len] = 0;
1215 return array;
1216 }
1217 }
1218 if(cloneArrayIfNeeded(len+1)) {
1219 array = getArrayStart();
1220 array[len] = 0;
1221 return array;
1222 } else {
1223 return NULL;
1224 }
1225 }
1226
1227 // setTo() analogous to the readonly-aliasing constructor with the same signature
1228 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)1229 UnicodeString::setTo(UBool isTerminated,
1230 const UChar *text,
1231 int32_t textLength)
1232 {
1233 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1234 // do not modify a string that has an "open" getBuffer(minCapacity)
1235 return *this;
1236 }
1237
1238 if(text == NULL) {
1239 // treat as an empty string, do not alias
1240 releaseArray();
1241 setToEmpty();
1242 return *this;
1243 }
1244
1245 if( textLength < -1 ||
1246 (textLength == -1 && !isTerminated) ||
1247 (textLength >= 0 && isTerminated && text[textLength] != 0)
1248 ) {
1249 setToBogus();
1250 return *this;
1251 }
1252
1253 releaseArray();
1254
1255 if(textLength == -1) {
1256 // text is terminated, or else it would have failed the above test
1257 textLength = u_strlen(text);
1258 }
1259 fUnion.fFields.fLengthAndFlags = kReadonlyAlias;
1260 setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1261 return *this;
1262 }
1263
1264 // setTo() analogous to the writable-aliasing constructor with the same signature
1265 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1266 UnicodeString::setTo(UChar *buffer,
1267 int32_t buffLength,
1268 int32_t buffCapacity) {
1269 if(fUnion.fFields.fLengthAndFlags & kOpenGetBuffer) {
1270 // do not modify a string that has an "open" getBuffer(minCapacity)
1271 return *this;
1272 }
1273
1274 if(buffer == NULL) {
1275 // treat as an empty string, do not alias
1276 releaseArray();
1277 setToEmpty();
1278 return *this;
1279 }
1280
1281 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1282 setToBogus();
1283 return *this;
1284 } else if(buffLength == -1) {
1285 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1286 const UChar *p = buffer, *limit = buffer + buffCapacity;
1287 while(p != limit && *p != 0) {
1288 ++p;
1289 }
1290 buffLength = (int32_t)(p - buffer);
1291 }
1292
1293 releaseArray();
1294
1295 fUnion.fFields.fLengthAndFlags = kWritableAlias;
1296 setArray(buffer, buffLength, buffCapacity);
1297 return *this;
1298 }
1299
setToUTF8(const StringPiece & utf8)1300 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1301 unBogus();
1302 int32_t length = utf8.length();
1303 int32_t capacity;
1304 // The UTF-16 string will be at most as long as the UTF-8 string.
1305 if(length <= US_STACKBUF_SIZE) {
1306 capacity = US_STACKBUF_SIZE;
1307 } else {
1308 capacity = length + 1; // +1 for the terminating NUL.
1309 }
1310 UChar *utf16 = getBuffer(capacity);
1311 int32_t length16;
1312 UErrorCode errorCode = U_ZERO_ERROR;
1313 u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1314 utf8.data(), length,
1315 0xfffd, // Substitution character.
1316 NULL, // Don't care about number of substitutions.
1317 &errorCode);
1318 releaseBuffer(length16);
1319 if(U_FAILURE(errorCode)) {
1320 setToBogus();
1321 }
1322 return *this;
1323 }
1324
1325 UnicodeString&
setCharAt(int32_t offset,UChar c)1326 UnicodeString::setCharAt(int32_t offset,
1327 UChar c)
1328 {
1329 int32_t len = length();
1330 if(cloneArrayIfNeeded() && len > 0) {
1331 if(offset < 0) {
1332 offset = 0;
1333 } else if(offset >= len) {
1334 offset = len - 1;
1335 }
1336
1337 getArrayStart()[offset] = c;
1338 }
1339 return *this;
1340 }
1341
1342 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1343 UnicodeString::replace(int32_t start,
1344 int32_t _length,
1345 UChar32 srcChar) {
1346 UChar buffer[U16_MAX_LENGTH];
1347 int32_t count = 0;
1348 UBool isError = FALSE;
1349 U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1350 // We test isError so that the compiler does not complain that we don't.
1351 // If isError (srcChar is not a valid code point) then count==0 which means
1352 // we remove the source segment rather than replacing it with srcChar.
1353 return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1354 }
1355
1356 UnicodeString&
append(UChar32 srcChar)1357 UnicodeString::append(UChar32 srcChar) {
1358 UChar buffer[U16_MAX_LENGTH];
1359 int32_t _length = 0;
1360 UBool isError = FALSE;
1361 U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1362 // We test isError so that the compiler does not complain that we don't.
1363 // If isError then _length==0 which turns the doAppend() into a no-op anyway.
1364 return isError ? *this : doAppend(buffer, 0, _length);
1365 }
1366
1367 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1368 UnicodeString::doReplace( int32_t start,
1369 int32_t length,
1370 const UnicodeString& src,
1371 int32_t srcStart,
1372 int32_t srcLength)
1373 {
1374 // pin the indices to legal values
1375 src.pinIndices(srcStart, srcLength);
1376
1377 // get the characters from src
1378 // and replace the range in ourselves with them
1379 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1380 }
1381
1382 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1383 UnicodeString::doReplace(int32_t start,
1384 int32_t length,
1385 const UChar *srcChars,
1386 int32_t srcStart,
1387 int32_t srcLength)
1388 {
1389 if(!isWritable()) {
1390 return *this;
1391 }
1392
1393 int32_t oldLength = this->length();
1394
1395 // optimize (read-only alias).remove(0, start) and .remove(start, end)
1396 if((fUnion.fFields.fLengthAndFlags&kBufferIsReadonly) && srcLength == 0) {
1397 if(start == 0) {
1398 // remove prefix by adjusting the array pointer
1399 pinIndex(length);
1400 fUnion.fFields.fArray += length;
1401 fUnion.fFields.fCapacity -= length;
1402 setLength(oldLength - length);
1403 return *this;
1404 } else {
1405 pinIndex(start);
1406 if(length >= (oldLength - start)) {
1407 // remove suffix by reducing the length (like truncate())
1408 setLength(start);
1409 fUnion.fFields.fCapacity = start; // not NUL-terminated any more
1410 return *this;
1411 }
1412 }
1413 }
1414
1415 if(start == oldLength) {
1416 return doAppend(srcChars, srcStart, srcLength);
1417 }
1418
1419 if(srcChars == 0) {
1420 srcStart = srcLength = 0;
1421 } else if(srcLength < 0) {
1422 // get the srcLength if necessary
1423 srcLength = u_strlen(srcChars + srcStart);
1424 }
1425
1426 // pin the indices to legal values
1427 pinIndices(start, length);
1428
1429 // calculate the size of the string after the replace
1430 int32_t newLength = oldLength - length + srcLength;
1431
1432 // cloneArrayIfNeeded(doCopyArray=FALSE) may change fArray but will not copy the current contents;
1433 // therefore we need to keep the current fArray
1434 UChar oldStackBuffer[US_STACKBUF_SIZE];
1435 UChar *oldArray;
1436 if((fUnion.fFields.fLengthAndFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1437 // copy the stack buffer contents because it will be overwritten with
1438 // fUnion.fFields values
1439 u_memcpy(oldStackBuffer, fUnion.fStackFields.fBuffer, oldLength);
1440 oldArray = oldStackBuffer;
1441 } else {
1442 oldArray = getArrayStart();
1443 }
1444
1445 // clone our array and allocate a bigger array if needed
1446 int32_t *bufferToDelete = 0;
1447 if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1448 FALSE, &bufferToDelete)
1449 ) {
1450 return *this;
1451 }
1452
1453 // now do the replace
1454
1455 UChar *newArray = getArrayStart();
1456 if(newArray != oldArray) {
1457 // if fArray changed, then we need to copy everything except what will change
1458 us_arrayCopy(oldArray, 0, newArray, 0, start);
1459 us_arrayCopy(oldArray, start + length,
1460 newArray, start + srcLength,
1461 oldLength - (start + length));
1462 } else if(length != srcLength) {
1463 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1464 us_arrayCopy(oldArray, start + length,
1465 newArray, start + srcLength,
1466 oldLength - (start + length));
1467 }
1468
1469 // now fill in the hole with the new string
1470 us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1471
1472 setLength(newLength);
1473
1474 // delayed delete in case srcChars == fArray when we started, and
1475 // to keep oldArray alive for the above operations
1476 if (bufferToDelete) {
1477 uprv_free(bufferToDelete);
1478 }
1479
1480 return *this;
1481 }
1482
1483 // Versions of doReplace() only for append() variants.
1484 // doReplace() and doAppend() optimize for different cases.
1485
1486 UnicodeString&
doAppend(const UnicodeString & src,int32_t srcStart,int32_t srcLength)1487 UnicodeString::doAppend(const UnicodeString& src, int32_t srcStart, int32_t srcLength) {
1488 if(srcLength == 0) {
1489 return *this;
1490 }
1491
1492 // pin the indices to legal values
1493 src.pinIndices(srcStart, srcLength);
1494 return doAppend(src.getArrayStart(), srcStart, srcLength);
1495 }
1496
1497 UnicodeString&
doAppend(const UChar * srcChars,int32_t srcStart,int32_t srcLength)1498 UnicodeString::doAppend(const UChar *srcChars, int32_t srcStart, int32_t srcLength) {
1499 if(!isWritable() || srcLength == 0 || srcChars == NULL) {
1500 return *this;
1501 }
1502
1503 if(srcLength < 0) {
1504 // get the srcLength if necessary
1505 if((srcLength = u_strlen(srcChars + srcStart)) == 0) {
1506 return *this;
1507 }
1508 }
1509
1510 int32_t oldLength = length();
1511 int32_t newLength = oldLength + srcLength;
1512 // optimize append() onto a large-enough, owned string
1513 if((newLength <= getCapacity() && isBufferWritable()) ||
1514 cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize)) {
1515 UChar *newArray = getArrayStart();
1516 // Do not copy characters when
1517 // UChar *buffer=str.getAppendBuffer(...);
1518 // is followed by
1519 // str.append(buffer, length);
1520 // or
1521 // str.appendString(buffer, length)
1522 // or similar.
1523 if(srcChars + srcStart != newArray + oldLength) {
1524 us_arrayCopy(srcChars, srcStart, newArray, oldLength, srcLength);
1525 }
1526 setLength(newLength);
1527 }
1528 return *this;
1529 }
1530
1531 /**
1532 * Replaceable API
1533 */
1534 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1535 UnicodeString::handleReplaceBetween(int32_t start,
1536 int32_t limit,
1537 const UnicodeString& text) {
1538 replaceBetween(start, limit, text);
1539 }
1540
1541 /**
1542 * Replaceable API
1543 */
1544 void
copy(int32_t start,int32_t limit,int32_t dest)1545 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1546 if (limit <= start) {
1547 return; // Nothing to do; avoid bogus malloc call
1548 }
1549 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1550 // Check to make sure text is not null.
1551 if (text != NULL) {
1552 extractBetween(start, limit, text, 0);
1553 insert(dest, text, 0, limit - start);
1554 uprv_free(text);
1555 }
1556 }
1557
1558 /**
1559 * Replaceable API
1560 *
1561 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1562 * so we implement this function here.
1563 */
hasMetaData() const1564 UBool Replaceable::hasMetaData() const {
1565 return TRUE;
1566 }
1567
1568 /**
1569 * Replaceable API
1570 */
hasMetaData() const1571 UBool UnicodeString::hasMetaData() const {
1572 return FALSE;
1573 }
1574
1575 UnicodeString&
doReverse(int32_t start,int32_t length)1576 UnicodeString::doReverse(int32_t start, int32_t length) {
1577 if(length <= 1 || !cloneArrayIfNeeded()) {
1578 return *this;
1579 }
1580
1581 // pin the indices to legal values
1582 pinIndices(start, length);
1583 if(length <= 1) { // pinIndices() might have shrunk the length
1584 return *this;
1585 }
1586
1587 UChar *left = getArrayStart() + start;
1588 UChar *right = left + length - 1; // -1 for inclusive boundary (length>=2)
1589 UChar swap;
1590 UBool hasSupplementary = FALSE;
1591
1592 // Before the loop we know left<right because length>=2.
1593 do {
1594 hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1595 hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1596 *right-- = swap;
1597 } while(left < right);
1598 // Make sure to test the middle code unit of an odd-length string.
1599 // Redundant if the length is even.
1600 hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1601
1602 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1603 if(hasSupplementary) {
1604 UChar swap2;
1605
1606 left = getArrayStart() + start;
1607 right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1608 while(left < right) {
1609 if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1610 *left++ = swap2;
1611 *left++ = swap;
1612 } else {
1613 ++left;
1614 }
1615 }
1616 }
1617
1618 return *this;
1619 }
1620
1621 UBool
padLeading(int32_t targetLength,UChar padChar)1622 UnicodeString::padLeading(int32_t targetLength,
1623 UChar padChar)
1624 {
1625 int32_t oldLength = length();
1626 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1627 return FALSE;
1628 } else {
1629 // move contents up by padding width
1630 UChar *array = getArrayStart();
1631 int32_t start = targetLength - oldLength;
1632 us_arrayCopy(array, 0, array, start, oldLength);
1633
1634 // fill in padding character
1635 while(--start >= 0) {
1636 array[start] = padChar;
1637 }
1638 setLength(targetLength);
1639 return TRUE;
1640 }
1641 }
1642
1643 UBool
padTrailing(int32_t targetLength,UChar padChar)1644 UnicodeString::padTrailing(int32_t targetLength,
1645 UChar padChar)
1646 {
1647 int32_t oldLength = length();
1648 if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1649 return FALSE;
1650 } else {
1651 // fill in padding character
1652 UChar *array = getArrayStart();
1653 int32_t length = targetLength;
1654 while(--length >= oldLength) {
1655 array[length] = padChar;
1656 }
1657 setLength(targetLength);
1658 return TRUE;
1659 }
1660 }
1661
1662 //========================================
1663 // Hashing
1664 //========================================
1665 int32_t
doHashCode() const1666 UnicodeString::doHashCode() const
1667 {
1668 /* Delegate hash computation to uhash. This makes UnicodeString
1669 * hashing consistent with UChar* hashing. */
1670 int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1671 if (hashCode == kInvalidHashCode) {
1672 hashCode = kEmptyHashCode;
1673 }
1674 return hashCode;
1675 }
1676
1677 //========================================
1678 // External Buffer
1679 //========================================
1680
1681 UChar *
getBuffer(int32_t minCapacity)1682 UnicodeString::getBuffer(int32_t minCapacity) {
1683 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1684 fUnion.fFields.fLengthAndFlags|=kOpenGetBuffer;
1685 setZeroLength();
1686 return getArrayStart();
1687 } else {
1688 return 0;
1689 }
1690 }
1691
1692 void
releaseBuffer(int32_t newLength)1693 UnicodeString::releaseBuffer(int32_t newLength) {
1694 if(fUnion.fFields.fLengthAndFlags&kOpenGetBuffer && newLength>=-1) {
1695 // set the new fLength
1696 int32_t capacity=getCapacity();
1697 if(newLength==-1) {
1698 // the new length is the string length, capped by fCapacity
1699 const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1700 while(p<limit && *p!=0) {
1701 ++p;
1702 }
1703 newLength=(int32_t)(p-array);
1704 } else if(newLength>capacity) {
1705 newLength=capacity;
1706 }
1707 setLength(newLength);
1708 fUnion.fFields.fLengthAndFlags&=~kOpenGetBuffer;
1709 }
1710 }
1711
1712 //========================================
1713 // Miscellaneous
1714 //========================================
// Makes sure this string has its own writable array with room for at least
// newCapacity code units, allocating a new array if necessary.
//
// newCapacity     - required capacity; -1 means "the current capacity"
// growCapacity    - capacity to try to allocate first; a negative value means
//                   "same as newCapacity"
// doCopyArray     - if true, the old contents (as far as they fit) are copied
//                   into a newly allocated array; if false, the length is
//                   reset to zero after a reallocation
// pBufferToDelete - if not 0 and the old refcounted buffer's count drops to
//                   zero, the buffer is handed to the caller via this pointer
//                   instead of being freed here
// forceClone      - if true, a new array is allocated unconditionally
//
// Returns TRUE if the string is writable with the requested capacity.
// Returns FALSE if the string is not writable, or if allocation failed; in
// the latter case the string is set to bogus.
UBool
UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
                                  int32_t growCapacity,
                                  UBool doCopyArray,
                                  int32_t **pBufferToDelete,
                                  UBool forceClone) {
  // default parameters need to be static, therefore
  // the defaults are -1 to have convenience defaults
  if(newCapacity == -1) {
    newCapacity = getCapacity();
  }

  // while a getBuffer(minCapacity) is "open",
  // prevent any modifications of the string by returning FALSE here
  // if the string is bogus, then only an assignment or similar can revive it
  if(!isWritable()) {
    return FALSE;
  }

  /*
   * We need to make a copy of the array if
   * the buffer is read-only, or
   * the buffer is refCounted (shared), and refCount>1, or
   * the buffer is too small.
   * Return FALSE if memory could not be allocated.
   */
  if(forceClone ||
     fUnion.fFields.fLengthAndFlags & kBufferIsReadonly ||
     (fUnion.fFields.fLengthAndFlags & kRefCounted && refCount() > 1) ||
     newCapacity > getCapacity()
  ) {
    // check growCapacity for default value and use of the stack buffer
    if(growCapacity < 0) {
      growCapacity = newCapacity;
    } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
      // the required capacity fits the stack buffer: do not grow beyond it
      growCapacity = US_STACKBUF_SIZE;
    }

    // save old values
    UChar oldStackBuffer[US_STACKBUF_SIZE];
    UChar *oldArray;
    int32_t oldLength = length();
    int16_t flags = fUnion.fFields.fLengthAndFlags;

    if(flags&kUsingStackBuffer) {
      U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutually exclusive */
      if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
        // copy the stack buffer contents because it will be overwritten with
        // fUnion.fFields values
        us_arrayCopy(fUnion.fStackFields.fBuffer, 0, oldStackBuffer, 0, oldLength);
        oldArray = oldStackBuffer;
      } else {
        oldArray = NULL; // no need to copy from the stack buffer to itself
      }
    } else {
      oldArray = fUnion.fFields.fArray;
      U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
    }

    // allocate a new array
    // (try the larger growCapacity first, then fall back to newCapacity)
    if(allocate(growCapacity) ||
       (newCapacity < growCapacity && allocate(newCapacity))
    ) {
      if(doCopyArray) {
        // copy the contents
        // do not copy more than what fits - it may be smaller than before
        int32_t minLength = oldLength;
        newCapacity = getCapacity();
        if(newCapacity < minLength) {
          minLength = newCapacity;
        }
        if(oldArray != NULL) {
          us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
        }
        setLength(minLength);
      } else {
        setZeroLength();
      }

      // release the old array
      if(flags & kRefCounted) {
        // the array is refCounted; decrement and release if 0
        // (the reference counter is read from the slot directly before the array)
        u_atomic_int32_t *pRefCount = ((u_atomic_int32_t *)oldArray - 1);
        if(umtx_atomic_dec(pRefCount) == 0) {
          if(pBufferToDelete == 0) {
            // Note: cast to (void *) is needed with MSVC, where u_atomic_int32_t
            // is defined as volatile. (Volatile has useful non-standard behavior
            // with this compiler.)
            uprv_free((void *)pRefCount);
          } else {
            // the caller requested to delete it himself
            *pBufferToDelete = (int32_t *)pRefCount;
          }
        }
      }
    } else {
      // not enough memory for growCapacity and not even for the smaller newCapacity
      // reset the old values for setToBogus() to release the array
      if(!(flags&kUsingStackBuffer)) {
        fUnion.fFields.fArray = oldArray;
      }
      fUnion.fFields.fLengthAndFlags = flags;
      setToBogus();
      return FALSE;
    }
  }
  return TRUE;
}
1823
1824 // UnicodeStringAppendable ------------------------------------------------- ***
1825
~UnicodeStringAppendable()1826 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1827
1828 UBool
appendCodeUnit(UChar c)1829 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1830 return str.doAppend(&c, 0, 1).isWritable();
1831 }
1832
1833 UBool
appendCodePoint(UChar32 c)1834 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1835 UChar buffer[U16_MAX_LENGTH];
1836 int32_t cLength = 0;
1837 UBool isError = FALSE;
1838 U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1839 return !isError && str.doAppend(buffer, 0, cLength).isWritable();
1840 }
1841
1842 UBool
appendString(const UChar * s,int32_t length)1843 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1844 return str.doAppend(s, 0, length).isWritable();
1845 }
1846
1847 UBool
reserveAppendCapacity(int32_t appendCapacity)1848 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1849 return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1850 }
1851
1852 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1853 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1854 int32_t desiredCapacityHint,
1855 UChar *scratch, int32_t scratchCapacity,
1856 int32_t *resultCapacity) {
1857 if(minCapacity < 1 || scratchCapacity < minCapacity) {
1858 *resultCapacity = 0;
1859 return NULL;
1860 }
1861 int32_t oldLength = str.length();
1862 if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1863 *resultCapacity = str.getCapacity() - oldLength;
1864 return str.getArrayStart() + oldLength;
1865 }
1866 *resultCapacity = scratchCapacity;
1867 return scratch;
1868 }
1869
1870 U_NAMESPACE_END
1871
1872 U_NAMESPACE_USE
1873
1874 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1875 uhash_hashUnicodeString(const UElement key) {
1876 const UnicodeString *str = (const UnicodeString*) key.pointer;
1877 return (str == NULL) ? 0 : str->hashCode();
1878 }
1879
1880 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1881 // does not depend on hashtable code.
1882 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1883 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1884 const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1885 const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1886 if (str1 == str2) {
1887 return TRUE;
1888 }
1889 if (str1 == NULL || str2 == NULL) {
1890 return FALSE;
1891 }
1892 return *str1 == *str2;
1893 }
1894
1895 #ifdef U_STATIC_IMPLEMENTATION
1896 /*
1897 This should never be called. It is defined here to make sure that the
1898 virtual vector deleting destructor is defined within unistr.cpp.
1899 The vector deleting destructor is already a part of UObject,
1900 but defining it here makes sure that it is included with this object file.
1901 This makes sure that static library dependencies are kept to a minimum.
1902 */
uprv_UnicodeStringDummy(void)1903 static void uprv_UnicodeStringDummy(void) {
1904 delete [] (new UnicodeString[2]);
1905 }
1906 #endif
1907