1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2007, International Business Machines Corporation and *
4 * others. All Rights Reserved. *
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 * Date Name Description
12 * 09/25/98 stephen Creation.
13 * 04/20/99 stephen Overhauled per 4/16 code review.
14 * 07/09/99 stephen Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 * 11/18/99 aliu Added handleReplaceBetween() to make inherit from
16 * Replaceable.
17 * 06/25/01 grhoten Removed the dependency on iostream
18 ******************************************************************************
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/putil.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "unicode/ustring.h"
26 #include "unicode/unistr.h"
27 #include "uhash.h"
28 #include "ustr_imp.h"
29 #include "umutex.h"
30
31 #if 0
32
33 #if U_IOSTREAM_SOURCE >= 199711
34 #include <iostream>
35 using namespace std;
36 #elif U_IOSTREAM_SOURCE >= 198506
37 #include <iostream.h>
38 #endif
39
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43 const char *name)
44 {
45 UChar c;
46 cout << name << ":|";
47 for(int i = 0; i < s.length(); ++i) {
48 c = s[i];
49 if(c>= 0x007E || c < 0x0020)
50 cout << "[0x" << hex << s[i] << "]";
51 else
52 cout << (char) s[i];
53 }
54 cout << '|' << endl;
55 }
56
57 void
58 print(const UChar *s,
59 int32_t len,
60 const char *name)
61 {
62 UChar c;
63 cout << name << ":|";
64 for(int i = 0; i < len; ++i) {
65 c = s[i];
66 if(c>= 0x007E || c < 0x0020)
67 cout << "[0x" << hex << s[i] << "]";
68 else
69 cout << (char) s[i];
70 }
71 cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75
76 // Local function definitions for now
77
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82 UChar *dst, int32_t dstStart, int32_t count)
83 {
84 if(count>0) {
85 uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86 }
87 }
88
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93 return ((U_NAMESPACE_QUALIFIER UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96
97 U_NAMESPACE_BEGIN
98
99 /* The Replaceable virtual destructor can't be defined in the header
100 due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
Replaceable()103 Replaceable::Replaceable() {}
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108 return
109 UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110 append(s1).
111 append(s2);
112 }
113
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 // have a chance to automatically inline.
117 //========================================
118
119 void
addRef()120 UnicodeString::addRef()
121 { umtx_atomic_inc((int32_t *)fArray - 1);}
122
123 int32_t
removeRef()124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fArray - 1);}
126
127 int32_t
refCount() const128 UnicodeString::refCount() const
129 {
130 umtx_lock(NULL);
131 // Note: without the lock to force a memory barrier, we might see a very
132 // stale value on some multi-processor systems.
133 int32_t count = *((int32_t *)fArray - 1);
134 umtx_unlock(NULL);
135 return count;
136 }
137
138 void
releaseArray()139 UnicodeString::releaseArray() {
140 if((fFlags & kRefCounted) && removeRef() == 0) {
141 uprv_free((int32_t *)fArray - 1);
142 }
143 }
144
145
146
147 //========================================
148 // Constructors
149 //========================================
UnicodeString()150 UnicodeString::UnicodeString()
151 : fLength(0),
152 fCapacity(US_STACKBUF_SIZE),
153 fArray(fStackBuffer),
154 fFlags(kShortString)
155 {}
156
UnicodeString(int32_t capacity,UChar32 c,int32_t count)157 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
158 : fLength(0),
159 fCapacity(US_STACKBUF_SIZE),
160 fArray(0),
161 fFlags(0)
162 {
163 if(count <= 0 || (uint32_t)c > 0x10ffff) {
164 // just allocate and do not do anything else
165 allocate(capacity);
166 } else {
167 // count > 0, allocate and fill the new string with count c's
168 int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
169 if(capacity < length) {
170 capacity = length;
171 }
172 if(allocate(capacity)) {
173 int32_t i = 0;
174
175 // fill the new string with c
176 if(unitCount == 1) {
177 // fill with length UChars
178 while(i < length) {
179 fArray[i++] = (UChar)c;
180 }
181 } else {
182 // get the code units for c
183 UChar units[UTF_MAX_CHAR_LENGTH];
184 UTF_APPEND_CHAR_UNSAFE(units, i, c);
185
186 // now it must be i==unitCount
187 i = 0;
188
189 // for Unicode, unitCount can only be 1, 2, 3, or 4
190 // 1 is handled above
191 while(i < length) {
192 int32_t unitIdx = 0;
193 while(unitIdx < unitCount) {
194 fArray[i++]=units[unitIdx++];
195 }
196 }
197 }
198 }
199 fLength = length;
200 }
201 }
202
UnicodeString(UChar ch)203 UnicodeString::UnicodeString(UChar ch)
204 : fLength(1),
205 fCapacity(US_STACKBUF_SIZE),
206 fArray(fStackBuffer),
207 fFlags(kShortString)
208 {
209 fStackBuffer[0] = ch;
210 }
211
UnicodeString(UChar32 ch)212 UnicodeString::UnicodeString(UChar32 ch)
213 : fLength(1),
214 fCapacity(US_STACKBUF_SIZE),
215 fArray(fStackBuffer),
216 fFlags(kShortString)
217 {
218 int32_t i = 0;
219 UBool isError = FALSE;
220 U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
221 fLength = i;
222 }
223
UnicodeString(const UChar * text)224 UnicodeString::UnicodeString(const UChar *text)
225 : fLength(0),
226 fCapacity(US_STACKBUF_SIZE),
227 fArray(fStackBuffer),
228 fFlags(kShortString)
229 {
230 doReplace(0, 0, text, 0, -1);
231 }
232
UnicodeString(const UChar * text,int32_t textLength)233 UnicodeString::UnicodeString(const UChar *text,
234 int32_t textLength)
235 : fLength(0),
236 fCapacity(US_STACKBUF_SIZE),
237 fArray(fStackBuffer),
238 fFlags(kShortString)
239 {
240 doReplace(0, 0, text, 0, textLength);
241 }
242
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)243 UnicodeString::UnicodeString(UBool isTerminated,
244 const UChar *text,
245 int32_t textLength)
246 : fLength(textLength),
247 fCapacity(isTerminated ? textLength + 1 : textLength),
248 fArray((UChar *)text),
249 fFlags(kReadonlyAlias)
250 {
251 if(text == NULL) {
252 // treat as an empty string, do not alias
253 fLength = 0;
254 fCapacity = US_STACKBUF_SIZE;
255 fArray = fStackBuffer;
256 fFlags = kShortString;
257 } else if(textLength < -1 ||
258 (textLength == -1 && !isTerminated) ||
259 (textLength >= 0 && isTerminated && text[textLength] != 0)
260 ) {
261 setToBogus();
262 } else if(textLength == -1) {
263 // text is terminated, or else it would have failed the above test
264 fLength = u_strlen(text);
265 fCapacity = fLength + 1;
266 }
267 }
268
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)269 UnicodeString::UnicodeString(UChar *buff,
270 int32_t buffLength,
271 int32_t buffCapacity)
272 : fLength(buffLength),
273 fCapacity(buffCapacity),
274 fArray(buff),
275 fFlags(kWritableAlias)
276 {
277 if(buff == NULL) {
278 // treat as an empty string, do not alias
279 fLength = 0;
280 fCapacity = US_STACKBUF_SIZE;
281 fArray = fStackBuffer;
282 fFlags = kShortString;
283 } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
284 setToBogus();
285 } else if(buffLength == -1) {
286 // fLength = u_strlen(buff); but do not look beyond buffCapacity
287 const UChar *p = buff, *limit = buff + buffCapacity;
288 while(p != limit && *p != 0) {
289 ++p;
290 }
291 fLength = (int32_t)(p - buff);
292 }
293 }
294
UnicodeString(const char * src,int32_t length,EInvariant)295 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
296 : fLength(0),
297 fCapacity(US_STACKBUF_SIZE),
298 fArray(fStackBuffer),
299 fFlags(kShortString)
300 {
301 if(src==NULL) {
302 // treat as an empty string
303 } else {
304 if(length<0) {
305 length=(int32_t)uprv_strlen(src);
306 }
307 if(cloneArrayIfNeeded(length, length, FALSE)) {
308 u_charsToUChars(src, getArrayStart(), length);
309 fLength = length;
310 } else {
311 setToBogus();
312 }
313 }
314 }
315
// Copy constructor: starts as an empty stack-buffer string, then lets
// copyFrom() take over the contents of that.
UnicodeString::UnicodeString(const UnicodeString& that)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  copyFrom(that);
}
325
// Substring constructor: starts as an empty stack-buffer string, then
// sets the contents with setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  setTo(that, srcStart);
}
336
// Substring constructor: starts as an empty stack-buffer string, then
// sets the contents with setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  setTo(that, srcStart, srcLength);
}
348
349 // Replaceable base class clone() default implementation, does not clone
350 Replaceable *
clone() const351 Replaceable::clone() const {
352 return NULL;
353 }
354
355 // UnicodeString overrides clone() with a real implementation
356 Replaceable *
clone() const357 UnicodeString::clone() const {
358 return new UnicodeString(*this);
359 }
360
361 //========================================
362 // array allocation
363 //========================================
364
365 UBool
allocate(int32_t capacity)366 UnicodeString::allocate(int32_t capacity) {
367 if(capacity <= US_STACKBUF_SIZE) {
368 fArray = fStackBuffer;
369 fCapacity = US_STACKBUF_SIZE;
370 fFlags = kShortString;
371 } else {
372 // count bytes for the refCounter and the string capacity, and
373 // round up to a multiple of 16; then divide by 4 and allocate int32_t's
374 // to be safely aligned for the refCount
375 int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
376 int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
377 if(array != 0) {
378 // set initial refCount and point behind the refCount
379 *array++ = 1;
380
381 // have fArray point to the first UChar
382 fArray = (UChar *)array;
383 fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
384 fFlags = kLongString;
385 } else {
386 fLength = 0;
387 fCapacity = 0;
388 fFlags = kIsBogus;
389 return FALSE;
390 }
391 }
392 return TRUE;
393 }
394
395 //========================================
396 // Destructor
397 //========================================
~UnicodeString()398 UnicodeString::~UnicodeString()
399 {
400 releaseArray();
401 }
402
403
404 //========================================
405 // Assignment
406 //========================================
407
408 UnicodeString &
operator =(const UnicodeString & src)409 UnicodeString::operator=(const UnicodeString &src) {
410 return copyFrom(src);
411 }
412
413 UnicodeString &
fastCopyFrom(const UnicodeString & src)414 UnicodeString::fastCopyFrom(const UnicodeString &src) {
415 return copyFrom(src, TRUE);
416 }
417
418 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)419 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
420 // if assigning to ourselves, do nothing
421 if(this == 0 || this == &src) {
422 return *this;
423 }
424
425 // is the right side bogus?
426 if(&src == 0 || src.isBogus()) {
427 setToBogus();
428 return *this;
429 }
430
431 // delete the current contents
432 releaseArray();
433
434 // we always copy the length
435 fLength = src.fLength;
436 if(fLength == 0) {
437 // empty string - use the stack buffer
438 fArray = fStackBuffer;
439 fCapacity = US_STACKBUF_SIZE;
440 fFlags = kShortString;
441 return *this;
442 }
443
444 // fLength>0 and not an "open" src.getBuffer(minCapacity)
445 switch(src.fFlags) {
446 case kShortString:
447 // short string using the stack buffer, do the same
448 fArray = fStackBuffer;
449 fCapacity = US_STACKBUF_SIZE;
450 fFlags = kShortString;
451 uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
452 break;
453 case kLongString:
454 // src uses a refCounted string buffer, use that buffer with refCount
455 // src is const, use a cast - we don't really change it
456 ((UnicodeString &)src).addRef();
457 // copy all fields, share the reference-counted buffer
458 fArray = src.fArray;
459 fCapacity = src.fCapacity;
460 fFlags = src.fFlags;
461 break;
462 case kReadonlyAlias:
463 if(fastCopy) {
464 // src is a readonly alias, do the same
465 // -> maintain the readonly alias as such
466 fArray = src.fArray;
467 fCapacity = src.fCapacity;
468 fFlags = src.fFlags;
469 break;
470 }
471 // else if(!fastCopy) fall through to case kWritableAlias
472 // -> allocate a new buffer and copy the contents
473 case kWritableAlias:
474 // src is a writable alias; we make a copy of that instead
475 if(allocate(fLength)) {
476 uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
477 break;
478 }
479 // if there is not enough memory, then fall through to setting to bogus
480 default:
481 // if src is bogus, set ourselves to bogus
482 // do not call setToBogus() here because fArray and fFlags are not consistent here
483 fArray = 0;
484 fLength = 0;
485 fCapacity = 0;
486 fFlags = kIsBogus;
487 break;
488 }
489
490 return *this;
491 }
492
493 //========================================
494 // Miscellaneous operations
495 //========================================
496
unescape() const497 UnicodeString UnicodeString::unescape() const {
498 UnicodeString result;
499 for (int32_t i=0; i<length(); ) {
500 UChar32 c = charAt(i++);
501 if (c == 0x005C /*'\\'*/) {
502 c = unescapeAt(i); // advances i
503 if (c == (UChar32)0xFFFFFFFF) {
504 result.remove(); // return empty string
505 break; // invalid escape sequence
506 }
507 }
508 result.append(c);
509 }
510 return result;
511 }
512
unescapeAt(int32_t & offset) const513 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
514 return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
515 }
516
517 //========================================
518 // Read-only implementation
519 //========================================
520 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const521 UnicodeString::doCompare( int32_t start,
522 int32_t length,
523 const UChar *srcChars,
524 int32_t srcStart,
525 int32_t srcLength) const
526 {
527 // compare illegal string values
528 // treat const UChar *srcChars==NULL as an empty string
529 if(isBogus()) {
530 return -1;
531 }
532
533 // pin indices to legal values
534 pinIndices(start, length);
535
536 if(srcChars == NULL) {
537 srcStart = srcLength = 0;
538 }
539
540 // get the correct pointer
541 const UChar *chars = getArrayStart();
542
543 chars += start;
544 srcChars += srcStart;
545
546 int32_t minLength;
547 int8_t lengthResult;
548
549 // get the srcLength if necessary
550 if(srcLength < 0) {
551 srcLength = u_strlen(srcChars + srcStart);
552 }
553
554 // are we comparing different lengths?
555 if(length != srcLength) {
556 if(length < srcLength) {
557 minLength = length;
558 lengthResult = -1;
559 } else {
560 minLength = srcLength;
561 lengthResult = 1;
562 }
563 } else {
564 minLength = length;
565 lengthResult = 0;
566 }
567
568 /*
569 * note that uprv_memcmp() returns an int but we return an int8_t;
570 * we need to take care not to truncate the result -
571 * one way to do this is to right-shift the value to
572 * move the sign bit into the lower 8 bits and making sure that this
573 * does not become 0 itself
574 */
575
576 if(minLength > 0 && chars != srcChars) {
577 int32_t result;
578
579 # if U_IS_BIG_ENDIAN
580 // big-endian: byte comparison works
581 result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
582 if(result != 0) {
583 return (int8_t)(result >> 15 | 1);
584 }
585 # else
586 // little-endian: compare UChar units
587 do {
588 result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
589 if(result != 0) {
590 return (int8_t)(result >> 15 | 1);
591 }
592 } while(--minLength > 0);
593 # endif
594 }
595 return lengthResult;
596 }
597
598 /* String compare in code point order - doCompare() compares in code unit order. */
599 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const600 UnicodeString::doCompareCodePointOrder(int32_t start,
601 int32_t length,
602 const UChar *srcChars,
603 int32_t srcStart,
604 int32_t srcLength) const
605 {
606 // compare illegal string values
607 // treat const UChar *srcChars==NULL as an empty string
608 if(isBogus()) {
609 return -1;
610 }
611
612 // pin indices to legal values
613 pinIndices(start, length);
614
615 if(srcChars == NULL) {
616 srcStart = srcLength = 0;
617 }
618
619 int32_t diff = uprv_strCompare(fArray + start, length, srcChars + srcStart, srcLength, FALSE, TRUE);
620 /* translate the 32-bit result into an 8-bit one */
621 if(diff!=0) {
622 return (int8_t)(diff >> 15 | 1);
623 } else {
624 return 0;
625 }
626 }
627
628 int32_t
getLength() const629 UnicodeString::getLength() const {
630 return length();
631 }
632
633 UChar
getCharAt(int32_t offset) const634 UnicodeString::getCharAt(int32_t offset) const {
635 return charAt(offset);
636 }
637
638 UChar32
getChar32At(int32_t offset) const639 UnicodeString::getChar32At(int32_t offset) const {
640 return char32At(offset);
641 }
642
643 int32_t
countChar32(int32_t start,int32_t length) const644 UnicodeString::countChar32(int32_t start, int32_t length) const {
645 pinIndices(start, length);
646 // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
647 return u_countChar32(fArray+start, length);
648 }
649
650 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const651 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
652 pinIndices(start, length);
653 // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
654 return u_strHasMoreChar32Than(fArray+start, length, number);
655 }
656
657 int32_t
moveIndex32(int32_t index,int32_t delta) const658 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
659 // pin index
660 if(index<0) {
661 index=0;
662 } else if(index>fLength) {
663 index=fLength;
664 }
665
666 if(delta>0) {
667 UTF_FWD_N(fArray, index, fLength, delta);
668 } else {
669 UTF_BACK_N(fArray, 0, index, -delta);
670 }
671
672 return index;
673 }
674
675 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const676 UnicodeString::doExtract(int32_t start,
677 int32_t length,
678 UChar *dst,
679 int32_t dstStart) const
680 {
681 // pin indices to legal values
682 pinIndices(start, length);
683
684 // do not copy anything if we alias dst itself
685 if(fArray + start != dst + dstStart) {
686 us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
687 }
688 }
689
690 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const691 UnicodeString::extract(UChar *dest, int32_t destCapacity,
692 UErrorCode &errorCode) const {
693 if(U_SUCCESS(errorCode)) {
694 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
695 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
696 } else {
697 if(fLength>0 && fLength<=destCapacity && fArray!=dest) {
698 uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR);
699 }
700 return u_terminateUChars(dest, destCapacity, fLength, &errorCode);
701 }
702 }
703
704 return fLength;
705 }
706
707 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const708 UnicodeString::extract(int32_t start,
709 int32_t length,
710 char *target,
711 int32_t targetCapacity,
712 enum EInvariant) const
713 {
714 // if the arguments are illegal, then do nothing
715 if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
716 return 0;
717 }
718
719 // pin the indices to legal values
720 pinIndices(start, length);
721
722 if(length <= targetCapacity) {
723 u_UCharsToChars(getArrayStart() + start, target, length);
724 }
725 UErrorCode status = U_ZERO_ERROR;
726 return u_terminateChars(target, targetCapacity, length, &status);
727 }
728
729 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const730 UnicodeString::extractBetween(int32_t start,
731 int32_t limit,
732 UnicodeString& target) const {
733 pinIndex(start);
734 pinIndex(limit);
735 doExtract(start, limit - start, target);
736 }
737
738 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const739 UnicodeString::indexOf(const UChar *srcChars,
740 int32_t srcStart,
741 int32_t srcLength,
742 int32_t start,
743 int32_t length) const
744 {
745 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
746 return -1;
747 }
748
749 // UnicodeString does not find empty substrings
750 if(srcLength < 0 && srcChars[srcStart] == 0) {
751 return -1;
752 }
753
754 // get the indices within bounds
755 pinIndices(start, length);
756
757 // find the first occurrence of the substring
758 const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
759 if(match == NULL) {
760 return -1;
761 } else {
762 return (int32_t)(match - fArray);
763 }
764 }
765
766 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const767 UnicodeString::doIndexOf(UChar c,
768 int32_t start,
769 int32_t length) const
770 {
771 // pin indices
772 pinIndices(start, length);
773
774 // find the first occurrence of c
775 const UChar *match = u_memchr(fArray + start, c, length);
776 if(match == NULL) {
777 return -1;
778 } else {
779 return (int32_t)(match - fArray);
780 }
781 }
782
783 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const784 UnicodeString::doIndexOf(UChar32 c,
785 int32_t start,
786 int32_t length) const {
787 // pin indices
788 pinIndices(start, length);
789
790 // find the first occurrence of c
791 const UChar *match = u_memchr32(fArray + start, c, length);
792 if(match == NULL) {
793 return -1;
794 } else {
795 return (int32_t)(match - fArray);
796 }
797 }
798
799 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const800 UnicodeString::lastIndexOf(const UChar *srcChars,
801 int32_t srcStart,
802 int32_t srcLength,
803 int32_t start,
804 int32_t length) const
805 {
806 if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
807 return -1;
808 }
809
810 // UnicodeString does not find empty substrings
811 if(srcLength < 0 && srcChars[srcStart] == 0) {
812 return -1;
813 }
814
815 // get the indices within bounds
816 pinIndices(start, length);
817
818 // find the last occurrence of the substring
819 const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
820 if(match == NULL) {
821 return -1;
822 } else {
823 return (int32_t)(match - fArray);
824 }
825 }
826
827 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const828 UnicodeString::doLastIndexOf(UChar c,
829 int32_t start,
830 int32_t length) const
831 {
832 if(isBogus()) {
833 return -1;
834 }
835
836 // pin indices
837 pinIndices(start, length);
838
839 // find the last occurrence of c
840 const UChar *match = u_memrchr(fArray + start, c, length);
841 if(match == NULL) {
842 return -1;
843 } else {
844 return (int32_t)(match - fArray);
845 }
846 }
847
848 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const849 UnicodeString::doLastIndexOf(UChar32 c,
850 int32_t start,
851 int32_t length) const {
852 // pin indices
853 pinIndices(start, length);
854
855 // find the last occurrence of c
856 const UChar *match = u_memrchr32(fArray + start, c, length);
857 if(match == NULL) {
858 return -1;
859 } else {
860 return (int32_t)(match - fArray);
861 }
862 }
863
864 //========================================
865 // Write implementation
866 //========================================
867
868 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)869 UnicodeString::findAndReplace(int32_t start,
870 int32_t length,
871 const UnicodeString& oldText,
872 int32_t oldStart,
873 int32_t oldLength,
874 const UnicodeString& newText,
875 int32_t newStart,
876 int32_t newLength)
877 {
878 if(isBogus() || oldText.isBogus() || newText.isBogus()) {
879 return *this;
880 }
881
882 pinIndices(start, length);
883 oldText.pinIndices(oldStart, oldLength);
884 newText.pinIndices(newStart, newLength);
885
886 if(oldLength == 0) {
887 return *this;
888 }
889
890 while(length > 0 && length >= oldLength) {
891 int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
892 if(pos < 0) {
893 // no more oldText's here: done
894 break;
895 } else {
896 // we found oldText, replace it by newText and go beyond it
897 replace(pos, oldLength, newText, newStart, newLength);
898 length -= pos + oldLength - start;
899 start = pos + newLength;
900 }
901 }
902
903 return *this;
904 }
905
906
907 void
setToBogus()908 UnicodeString::setToBogus()
909 {
910 releaseArray();
911
912 fArray = 0;
913 fCapacity = fLength = 0;
914 fFlags = kIsBogus;
915 }
916
917 // turn a bogus string into an empty one
918 void
unBogus()919 UnicodeString::unBogus() {
920 if(fFlags & kIsBogus) {
921 fArray = fStackBuffer;
922 fLength = 0;
923 fCapacity = US_STACKBUF_SIZE;
924 fFlags = kShortString;
925 }
926 }
927
928 // setTo() analogous to the readonly-aliasing constructor with the same signature
929 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)930 UnicodeString::setTo(UBool isTerminated,
931 const UChar *text,
932 int32_t textLength)
933 {
934 if(fFlags & kOpenGetBuffer) {
935 // do not modify a string that has an "open" getBuffer(minCapacity)
936 return *this;
937 }
938
939 if(text == NULL) {
940 // treat as an empty string, do not alias
941 releaseArray();
942 fLength = 0;
943 fCapacity = US_STACKBUF_SIZE;
944 fArray = fStackBuffer;
945 fFlags = kShortString;
946 return *this;
947 }
948
949 if( textLength < -1 ||
950 (textLength == -1 && !isTerminated) ||
951 (textLength >= 0 && isTerminated && text[textLength] != 0)
952 ) {
953 setToBogus();
954 return *this;
955 }
956
957 releaseArray();
958
959 fArray = (UChar *)text;
960 if(textLength != -1) {
961 fLength = textLength;
962 fCapacity = isTerminated ? fLength + 1 : fLength;
963 } else {
964 // text is terminated, or else it would have failed the above test
965 fLength = u_strlen(text);
966 fCapacity = fLength + 1;
967 }
968
969 fFlags = kReadonlyAlias;
970 return *this;
971 }
972
973 // setTo() analogous to the writable-aliasing constructor with the same signature
974 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)975 UnicodeString::setTo(UChar *buffer,
976 int32_t buffLength,
977 int32_t buffCapacity) {
978 if(fFlags & kOpenGetBuffer) {
979 // do not modify a string that has an "open" getBuffer(minCapacity)
980 return *this;
981 }
982
983 if(buffer == NULL) {
984 // treat as an empty string, do not alias
985 releaseArray();
986 fLength = 0;
987 fCapacity = US_STACKBUF_SIZE;
988 fArray = fStackBuffer;
989 fFlags = kShortString;
990 return *this;
991 }
992
993 if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
994 setToBogus();
995 return *this;
996 } else if(buffLength == -1) {
997 // buffLength = u_strlen(buff); but do not look beyond buffCapacity
998 const UChar *p = buffer, *limit = buffer + buffCapacity;
999 while(p != limit && *p != 0) {
1000 ++p;
1001 }
1002 buffLength = (int32_t)(p - buffer);
1003 }
1004
1005 releaseArray();
1006
1007 fArray = buffer;
1008 fLength = buffLength;
1009 fCapacity = buffCapacity;
1010 fFlags = kWritableAlias;
1011 return *this;
1012 }
1013
1014 UnicodeString&
setCharAt(int32_t offset,UChar c)1015 UnicodeString::setCharAt(int32_t offset,
1016 UChar c)
1017 {
1018 if(cloneArrayIfNeeded() && fLength > 0) {
1019 if(offset < 0) {
1020 offset = 0;
1021 } else if(offset >= fLength) {
1022 offset = fLength - 1;
1023 }
1024
1025 fArray[offset] = c;
1026 }
1027 return *this;
1028 }
1029
1030 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1031 UnicodeString::doReplace( int32_t start,
1032 int32_t length,
1033 const UnicodeString& src,
1034 int32_t srcStart,
1035 int32_t srcLength)
1036 {
1037 if(!src.isBogus()) {
1038 // pin the indices to legal values
1039 src.pinIndices(srcStart, srcLength);
1040
1041 // get the characters from src
1042 // and replace the range in ourselves with them
1043 return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1044 } else {
1045 // remove the range
1046 return doReplace(start, length, 0, 0, 0);
1047 }
1048 }
1049
1050 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1051 UnicodeString::doReplace(int32_t start,
1052 int32_t length,
1053 const UChar *srcChars,
1054 int32_t srcStart,
1055 int32_t srcLength)
1056 {
1057 if(isBogus()) {
1058 return *this;
1059 }
1060
1061 if(srcChars == 0) {
1062 srcStart = srcLength = 0;
1063 } else if(srcLength < 0) {
1064 // get the srcLength if necessary
1065 srcLength = u_strlen(srcChars + srcStart);
1066 }
1067
1068 int32_t *bufferToDelete = 0;
1069
1070 // the following may change fArray but will not copy the current contents;
1071 // therefore we need to keep the current fArray
1072 UChar *oldArray = fArray;
1073 int32_t oldLength = fLength;
1074
1075 // pin the indices to legal values
1076 pinIndices(start, length);
1077
1078 // calculate the size of the string after the replace
1079 int32_t newSize = oldLength - length + srcLength;
1080
1081 // clone our array and allocate a bigger array if needed
1082 if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
1083 FALSE, &bufferToDelete)
1084 ) {
1085 return *this;
1086 }
1087
1088 // now do the replace
1089
1090 if(fArray != oldArray) {
1091 // if fArray changed, then we need to copy everything except what will change
1092 us_arrayCopy(oldArray, 0, fArray, 0, start);
1093 us_arrayCopy(oldArray, start + length,
1094 fArray, start + srcLength,
1095 oldLength - (start + length));
1096 } else if(length != srcLength) {
1097 // fArray did not change; copy only the portion that isn't changing, leaving a hole
1098 us_arrayCopy(oldArray, start + length,
1099 fArray, start + srcLength,
1100 oldLength - (start + length));
1101 }
1102
1103 // now fill in the hole with the new string
1104 us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength);
1105
1106 fLength = newSize;
1107
1108 // delayed delete in case srcChars == fArray when we started, and
1109 // to keep oldArray alive for the above operations
1110 if (bufferToDelete) {
1111 uprv_free(bufferToDelete);
1112 }
1113
1114 return *this;
1115 }
1116
1117 /**
1118 * Replaceable API
1119 */
1120 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1121 UnicodeString::handleReplaceBetween(int32_t start,
1122 int32_t limit,
1123 const UnicodeString& text) {
1124 replaceBetween(start, limit, text);
1125 }
1126
1127 /**
1128 * Replaceable API
1129 */
1130 void
copy(int32_t start,int32_t limit,int32_t dest)1131 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1132 if (limit <= start) {
1133 return; // Nothing to do; avoid bogus malloc call
1134 }
1135 UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1136 extractBetween(start, limit, text, 0);
1137 insert(dest, text, 0, limit - start);
1138 uprv_free(text);
1139 }
1140
1141 /**
1142 * Replaceable API
1143 *
1144 * NOTE: This is for the Replaceable class. There is no rep.cpp,
1145 * so we implement this function here.
1146 */
hasMetaData() const1147 UBool Replaceable::hasMetaData() const {
1148 return TRUE;
1149 }
1150
1151 /**
1152 * Replaceable API
1153 */
hasMetaData() const1154 UBool UnicodeString::hasMetaData() const {
1155 return FALSE;
1156 }
1157
1158 UnicodeString&
doReverse(int32_t start,int32_t length)1159 UnicodeString::doReverse(int32_t start,
1160 int32_t length)
1161 {
1162 if(fLength <= 1 || !cloneArrayIfNeeded()) {
1163 return *this;
1164 }
1165
1166 // pin the indices to legal values
1167 pinIndices(start, length);
1168
1169 UChar *left = getArrayStart() + start;
1170 UChar *right = getArrayStart() + start + length;
1171 UChar swap;
1172 UBool hasSupplementary = FALSE;
1173
1174 while(left < --right) {
1175 hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left);
1176 hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right);
1177 *right = swap;
1178 }
1179
1180 /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1181 if(hasSupplementary) {
1182 UChar swap2;
1183
1184 left = getArrayStart() + start;
1185 right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left<right
1186 while(left < right) {
1187 if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) {
1188 *left++ = swap2;
1189 *left++ = swap;
1190 } else {
1191 ++left;
1192 }
1193 }
1194 }
1195
1196 return *this;
1197 }
1198
1199 UBool
padLeading(int32_t targetLength,UChar padChar)1200 UnicodeString::padLeading(int32_t targetLength,
1201 UChar padChar)
1202 {
1203 if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1204 return FALSE;
1205 } else {
1206 // move contents up by padding width
1207 int32_t start = targetLength - fLength;
1208 us_arrayCopy(fArray, 0, fArray, start, fLength);
1209
1210 // fill in padding character
1211 while(--start >= 0) {
1212 fArray[start] = padChar;
1213 }
1214 fLength = targetLength;
1215 return TRUE;
1216 }
1217 }
1218
1219 UBool
padTrailing(int32_t targetLength,UChar padChar)1220 UnicodeString::padTrailing(int32_t targetLength,
1221 UChar padChar)
1222 {
1223 if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1224 return FALSE;
1225 } else {
1226 // fill in padding character
1227 int32_t length = targetLength;
1228 while(--length >= fLength) {
1229 fArray[length] = padChar;
1230 }
1231 fLength = targetLength;
1232 return TRUE;
1233 }
1234 }
1235
1236 //========================================
1237 // Hashing
1238 //========================================
1239 int32_t
doHashCode() const1240 UnicodeString::doHashCode() const
1241 {
1242 /* Delegate hash computation to uhash. This makes UnicodeString
1243 * hashing consistent with UChar* hashing. */
1244 int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength);
1245 if (hashCode == kInvalidHashCode) {
1246 hashCode = kEmptyHashCode;
1247 }
1248 return hashCode;
1249 }
1250
1251 //========================================
1252 // External Buffer
1253 //========================================
1254
1255 UChar *
getBuffer(int32_t minCapacity)1256 UnicodeString::getBuffer(int32_t minCapacity) {
1257 if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1258 fFlags|=kOpenGetBuffer;
1259 fLength=0;
1260 return fArray;
1261 } else {
1262 return 0;
1263 }
1264 }
1265
1266 void
releaseBuffer(int32_t newLength)1267 UnicodeString::releaseBuffer(int32_t newLength) {
1268 if(fFlags&kOpenGetBuffer && newLength>=-1) {
1269 // set the new fLength
1270 if(newLength==-1) {
1271 // the new length is the string length, capped by fCapacity
1272 const UChar *p=fArray, *limit=fArray+fCapacity;
1273 while(p<limit && *p!=0) {
1274 ++p;
1275 }
1276 fLength=(int32_t)(p-fArray);
1277 } else if(newLength<=fCapacity) {
1278 fLength=newLength;
1279 } else {
1280 fLength=fCapacity;
1281 }
1282 fFlags&=~kOpenGetBuffer;
1283 }
1284 }
1285
1286 //========================================
1287 // Miscellaneous
1288 //========================================
1289 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1290 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1291 int32_t growCapacity,
1292 UBool doCopyArray,
1293 int32_t **pBufferToDelete,
1294 UBool forceClone) {
1295 // default parameters need to be static, therefore
1296 // the defaults are -1 to have convenience defaults
1297 if(newCapacity == -1) {
1298 newCapacity = fCapacity;
1299 }
1300
1301 // while a getBuffer(minCapacity) is "open",
1302 // prevent any modifications of the string by returning FALSE here
1303 // if the string is bogus, then only an assignment or similar can revive it
1304 if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) {
1305 return FALSE;
1306 }
1307
1308 /*
1309 * We need to make a copy of the array if
1310 * the buffer is read-only, or
1311 * the buffer is refCounted (shared), and refCount>1, or
1312 * the buffer is too small.
1313 * Return FALSE if memory could not be allocated.
1314 */
1315 if(forceClone ||
1316 fFlags & kBufferIsReadonly ||
1317 fFlags & kRefCounted && refCount() > 1 ||
1318 newCapacity > fCapacity
1319 ) {
1320 // save old values
1321 UChar *array = fArray;
1322 uint16_t flags = fFlags;
1323
1324 // check growCapacity for default value and use of the stack buffer
1325 if(growCapacity == -1) {
1326 growCapacity = newCapacity;
1327 } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1328 growCapacity = US_STACKBUF_SIZE;
1329 }
1330
1331 // allocate a new array
1332 if(allocate(growCapacity) ||
1333 newCapacity < growCapacity && allocate(newCapacity)
1334 ) {
1335 if(doCopyArray) {
1336 // copy the contents
1337 // do not copy more than what fits - it may be smaller than before
1338 if(fCapacity < fLength) {
1339 fLength = fCapacity;
1340 }
1341 us_arrayCopy(array, 0, fArray, 0, fLength);
1342 } else {
1343 fLength = 0;
1344 }
1345
1346 // release the old array
1347 if(flags & kRefCounted) {
1348 // the array is refCounted; decrement and release if 0
1349 int32_t *pRefCount = ((int32_t *)array - 1);
1350 if(umtx_atomic_dec(pRefCount) == 0) {
1351 if(pBufferToDelete == 0) {
1352 uprv_free(pRefCount);
1353 } else {
1354 // the caller requested to delete it himself
1355 *pBufferToDelete = pRefCount;
1356 }
1357 }
1358 }
1359 } else {
1360 // not enough memory for growCapacity and not even for the smaller newCapacity
1361 // reset the old values for setToBogus() to release the array
1362 fArray = array;
1363 fFlags = flags;
1364 setToBogus();
1365 return FALSE;
1366 }
1367 }
1368 return TRUE;
1369 }
1370 U_NAMESPACE_END
1371
1372 #ifdef U_STATIC_IMPLEMENTATION
1373 /*
1374 This should never be called. It is defined here to make sure that the
1375 virtual vector deleting destructor is defined within unistr.cpp.
1376 The vector deleting destructor is already a part of UObject,
1377 but defining it here makes sure that it is included with this object file.
1378 This makes sure that static library dependencies are kept to a minimum.
1379 */
uprv_UnicodeStringDummy(void)1380 static void uprv_UnicodeStringDummy(void) {
1381 U_NAMESPACE_USE
1382 delete [] (new UnicodeString[2]);
1383 }
1384 #endif
1385
1386