• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2012, International Business Machines Corporation and
4 * others. All Rights Reserved.
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   09/25/98    stephen     Creation.
13 *   04/20/99    stephen     Overhauled per 4/16 code review.
14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
16 *                           Replaceable.
17 *   06/25/01    grhoten     Removed the dependency on iostream
18 ******************************************************************************
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/appendable.h"
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/utf.h"
29 #include "unicode/utf16.h"
30 #include "uelement.h"
31 #include "ustr_imp.h"
32 #include "umutex.h"
33 #include "uassert.h"
34 
35 #if 0
36 
37 #include <iostream>
38 using namespace std;
39 
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43       const char *name)
44 {
45   UChar c;
46   cout << name << ":|";
47   for(int i = 0; i < s.length(); ++i) {
48     c = s[i];
49     if(c>= 0x007E || c < 0x0020)
50       cout << "[0x" << hex << s[i] << "]";
51     else
52       cout << (char) s[i];
53   }
54   cout << '|' << endl;
55 }
56 
57 void
58 print(const UChar *s,
59       int32_t len,
60       const char *name)
61 {
62   UChar c;
63   cout << name << ":|";
64   for(int i = 0; i < len; ++i) {
65     c = s[i];
66     if(c>= 0x007E || c < 0x0020)
67       cout << "[0x" << hex << s[i] << "]";
68     else
69       cout << (char) s[i];
70   }
71   cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75 
76 // Local function definitions for now
77 
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82          UChar *dst, int32_t dstStart, int32_t count)
83 {
84   if(count>0) {
85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86   }
87 }
88 
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93     return ((icu::UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96 
97 U_NAMESPACE_BEGIN
98 
99 /* The Replaceable virtual destructor can't be defined in the header
100    due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
Replaceable()103 Replaceable::Replaceable() {}
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105 
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108     return
109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110             append(s1).
111                 append(s2);
112 }
113 
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 //                               have a chance to automatically inline.
117 //========================================
118 
119 void
addRef()120 UnicodeString::addRef()
121 {  umtx_atomic_inc((int32_t *)fUnion.fFields.fArray - 1);}
122 
123 int32_t
removeRef()124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fUnion.fFields.fArray - 1);}
126 
127 int32_t
refCount() const128 UnicodeString::refCount() const
129 {
130     umtx_lock(NULL);
131     // Note: without the lock to force a memory barrier, we might see a very
132     //       stale value on some multi-processor systems.
133     int32_t  count = *((int32_t *)fUnion.fFields.fArray - 1);
134     umtx_unlock(NULL);
135     return count;
136  }
137 
138 void
releaseArray()139 UnicodeString::releaseArray() {
140   if((fFlags & kRefCounted) && removeRef() == 0) {
141     uprv_free((int32_t *)fUnion.fFields.fArray - 1);
142   }
143 }
144 
145 
146 
147 //========================================
148 // Constructors
149 //========================================
UnicodeString()150 UnicodeString::UnicodeString()
151   : fShortLength(0),
152     fFlags(kShortString)
153 {}
154 
UnicodeString(int32_t capacity,UChar32 c,int32_t count)155 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
156   : fShortLength(0),
157     fFlags(0)
158 {
159   if(count <= 0 || (uint32_t)c > 0x10ffff) {
160     // just allocate and do not do anything else
161     allocate(capacity);
162   } else {
163     // count > 0, allocate and fill the new string with count c's
164     int32_t unitCount = U16_LENGTH(c), length = count * unitCount;
165     if(capacity < length) {
166       capacity = length;
167     }
168     if(allocate(capacity)) {
169       UChar *array = getArrayStart();
170       int32_t i = 0;
171 
172       // fill the new string with c
173       if(unitCount == 1) {
174         // fill with length UChars
175         while(i < length) {
176           array[i++] = (UChar)c;
177         }
178       } else {
179         // get the code units for c
180         UChar units[U16_MAX_LENGTH];
181         U16_APPEND_UNSAFE(units, i, c);
182 
183         // now it must be i==unitCount
184         i = 0;
185 
186         // for Unicode, unitCount can only be 1, 2, 3, or 4
187         // 1 is handled above
188         while(i < length) {
189           int32_t unitIdx = 0;
190           while(unitIdx < unitCount) {
191             array[i++]=units[unitIdx++];
192           }
193         }
194       }
195     }
196     setLength(length);
197   }
198 }
199 
UnicodeString(UChar ch)200 UnicodeString::UnicodeString(UChar ch)
201   : fShortLength(1),
202     fFlags(kShortString)
203 {
204   fUnion.fStackBuffer[0] = ch;
205 }
206 
UnicodeString(UChar32 ch)207 UnicodeString::UnicodeString(UChar32 ch)
208   : fShortLength(0),
209     fFlags(kShortString)
210 {
211   int32_t i = 0;
212   UBool isError = FALSE;
213   U16_APPEND(fUnion.fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
214   // We test isError so that the compiler does not complain that we don't.
215   // If isError then i==0 which is what we want anyway.
216   if(!isError) {
217     fShortLength = (int8_t)i;
218   }
219 }
220 
UnicodeString(const UChar * text)221 UnicodeString::UnicodeString(const UChar *text)
222   : fShortLength(0),
223     fFlags(kShortString)
224 {
225   doReplace(0, 0, text, 0, -1);
226 }
227 
UnicodeString(const UChar * text,int32_t textLength)228 UnicodeString::UnicodeString(const UChar *text,
229                              int32_t textLength)
230   : fShortLength(0),
231     fFlags(kShortString)
232 {
233   doReplace(0, 0, text, 0, textLength);
234 }
235 
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)236 UnicodeString::UnicodeString(UBool isTerminated,
237                              const UChar *text,
238                              int32_t textLength)
239   : fShortLength(0),
240     fFlags(kReadonlyAlias)
241 {
242   if(text == NULL) {
243     // treat as an empty string, do not alias
244     setToEmpty();
245   } else if(textLength < -1 ||
246             (textLength == -1 && !isTerminated) ||
247             (textLength >= 0 && isTerminated && text[textLength] != 0)
248   ) {
249     setToBogus();
250   } else {
251     if(textLength == -1) {
252       // text is terminated, or else it would have failed the above test
253       textLength = u_strlen(text);
254     }
255     setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
256   }
257 }
258 
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)259 UnicodeString::UnicodeString(UChar *buff,
260                              int32_t buffLength,
261                              int32_t buffCapacity)
262   : fShortLength(0),
263     fFlags(kWritableAlias)
264 {
265   if(buff == NULL) {
266     // treat as an empty string, do not alias
267     setToEmpty();
268   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
269     setToBogus();
270   } else {
271     if(buffLength == -1) {
272       // fLength = u_strlen(buff); but do not look beyond buffCapacity
273       const UChar *p = buff, *limit = buff + buffCapacity;
274       while(p != limit && *p != 0) {
275         ++p;
276       }
277       buffLength = (int32_t)(p - buff);
278     }
279     setArray(buff, buffLength, buffCapacity);
280   }
281 }
282 
UnicodeString(const char * src,int32_t length,EInvariant)283 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
284   : fShortLength(0),
285     fFlags(kShortString)
286 {
287   if(src==NULL) {
288     // treat as an empty string
289   } else {
290     if(length<0) {
291       length=(int32_t)uprv_strlen(src);
292     }
293     if(cloneArrayIfNeeded(length, length, FALSE)) {
294       u_charsToUChars(src, getArrayStart(), length);
295       setLength(length);
296     } else {
297       setToBogus();
298     }
299   }
300 }
301 
302 #if U_CHARSET_IS_UTF8
303 
UnicodeString(const char * codepageData)304 UnicodeString::UnicodeString(const char *codepageData)
305   : fShortLength(0),
306     fFlags(kShortString) {
307   if(codepageData != 0) {
308     setToUTF8(codepageData);
309   }
310 }
311 
UnicodeString(const char * codepageData,int32_t dataLength)312 UnicodeString::UnicodeString(const char *codepageData, int32_t dataLength)
313   : fShortLength(0),
314     fFlags(kShortString) {
315   // if there's nothing to convert, do nothing
316   if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
317     return;
318   }
319   if(dataLength == -1) {
320     dataLength = (int32_t)uprv_strlen(codepageData);
321   }
322   setToUTF8(StringPiece(codepageData, dataLength));
323 }
324 
325 // else see unistr_cnv.cpp
326 #endif
327 
// Copy constructor: starts as an empty stack-buffer string and then takes
// over the contents of that via copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that)
  : Replaceable(), fShortLength(0), fFlags(kShortString)
{
  copyFrom(that);
}
335 
// Substring constructor: starts as an empty stack-buffer string and then
// calls setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart)
  : Replaceable(), fShortLength(0), fFlags(kShortString)
{
  setTo(that, srcStart);
}
344 
// Substring constructor: starts as an empty stack-buffer string and then
// calls setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength)
  : Replaceable(), fShortLength(0), fFlags(kShortString)
{
  setTo(that, srcStart, srcLength);
}
354 
355 // Replaceable base class clone() default implementation, does not clone
356 Replaceable *
clone() const357 Replaceable::clone() const {
358   return NULL;
359 }
360 
361 // UnicodeString overrides clone() with a real implementation
362 Replaceable *
clone() const363 UnicodeString::clone() const {
364   return new UnicodeString(*this);
365 }
366 
367 //========================================
368 // array allocation
369 //========================================
370 
371 UBool
allocate(int32_t capacity)372 UnicodeString::allocate(int32_t capacity) {
373   if(capacity <= US_STACKBUF_SIZE) {
374     fFlags = kShortString;
375   } else {
376     // count bytes for the refCounter and the string capacity, and
377     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
378     // to be safely aligned for the refCount
379     // the +1 is for the NUL terminator, to avoid reallocation in getTerminatedBuffer()
380     int32_t words = (int32_t)(((sizeof(int32_t) + (capacity + 1) * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
381     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
382     if(array != 0) {
383       // set initial refCount and point behind the refCount
384       *array++ = 1;
385 
386       // have fArray point to the first UChar
387       fUnion.fFields.fArray = (UChar *)array;
388       fUnion.fFields.fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
389       fFlags = kLongString;
390     } else {
391       fShortLength = 0;
392       fUnion.fFields.fArray = 0;
393       fUnion.fFields.fCapacity = 0;
394       fFlags = kIsBogus;
395       return FALSE;
396     }
397   }
398   return TRUE;
399 }
400 
401 //========================================
402 // Destructor
403 //========================================
~UnicodeString()404 UnicodeString::~UnicodeString()
405 {
406   releaseArray();
407 }
408 
409 //========================================
410 // Factory methods
411 //========================================
412 
fromUTF8(const StringPiece & utf8)413 UnicodeString UnicodeString::fromUTF8(const StringPiece &utf8) {
414   UnicodeString result;
415   result.setToUTF8(utf8);
416   return result;
417 }
418 
fromUTF32(const UChar32 * utf32,int32_t length)419 UnicodeString UnicodeString::fromUTF32(const UChar32 *utf32, int32_t length) {
420   UnicodeString result;
421   int32_t capacity;
422   // Most UTF-32 strings will be BMP-only and result in a same-length
423   // UTF-16 string. We overestimate the capacity just slightly,
424   // just in case there are a few supplementary characters.
425   if(length <= US_STACKBUF_SIZE) {
426     capacity = US_STACKBUF_SIZE;
427   } else {
428     capacity = length + (length >> 4) + 4;
429   }
430   do {
431     UChar *utf16 = result.getBuffer(capacity);
432     int32_t length16;
433     UErrorCode errorCode = U_ZERO_ERROR;
434     u_strFromUTF32WithSub(utf16, result.getCapacity(), &length16,
435         utf32, length,
436         0xfffd,  // Substitution character.
437         NULL,    // Don't care about number of substitutions.
438         &errorCode);
439     result.releaseBuffer(length16);
440     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
441       capacity = length16 + 1;  // +1 for the terminating NUL.
442       continue;
443     } else if(U_FAILURE(errorCode)) {
444       result.setToBogus();
445     }
446     break;
447   } while(TRUE);
448   return result;
449 }
450 
451 //========================================
452 // Assignment
453 //========================================
454 
455 UnicodeString &
operator =(const UnicodeString & src)456 UnicodeString::operator=(const UnicodeString &src) {
457   return copyFrom(src);
458 }
459 
460 UnicodeString &
fastCopyFrom(const UnicodeString & src)461 UnicodeString::fastCopyFrom(const UnicodeString &src) {
462   return copyFrom(src, TRUE);
463 }
464 
465 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)466 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
467   // if assigning to ourselves, do nothing
468   if(this == 0 || this == &src) {
469     return *this;
470   }
471 
472   // is the right side bogus?
473   if(&src == 0 || src.isBogus()) {
474     setToBogus();
475     return *this;
476   }
477 
478   // delete the current contents
479   releaseArray();
480 
481   if(src.isEmpty()) {
482     // empty string - use the stack buffer
483     setToEmpty();
484     return *this;
485   }
486 
487   // we always copy the length
488   int32_t srcLength = src.length();
489   setLength(srcLength);
490 
491   // fLength>0 and not an "open" src.getBuffer(minCapacity)
492   switch(src.fFlags) {
493   case kShortString:
494     // short string using the stack buffer, do the same
495     fFlags = kShortString;
496     uprv_memcpy(fUnion.fStackBuffer, src.fUnion.fStackBuffer, srcLength * U_SIZEOF_UCHAR);
497     break;
498   case kLongString:
499     // src uses a refCounted string buffer, use that buffer with refCount
500     // src is const, use a cast - we don't really change it
501     ((UnicodeString &)src).addRef();
502     // copy all fields, share the reference-counted buffer
503     fUnion.fFields.fArray = src.fUnion.fFields.fArray;
504     fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
505     fFlags = src.fFlags;
506     break;
507   case kReadonlyAlias:
508     if(fastCopy) {
509       // src is a readonly alias, do the same
510       // -> maintain the readonly alias as such
511       fUnion.fFields.fArray = src.fUnion.fFields.fArray;
512       fUnion.fFields.fCapacity = src.fUnion.fFields.fCapacity;
513       fFlags = src.fFlags;
514       break;
515     }
516     // else if(!fastCopy) fall through to case kWritableAlias
517     // -> allocate a new buffer and copy the contents
518   case kWritableAlias:
519     // src is a writable alias; we make a copy of that instead
520     if(allocate(srcLength)) {
521       uprv_memcpy(getArrayStart(), src.getArrayStart(), srcLength * U_SIZEOF_UCHAR);
522       break;
523     }
524     // if there is not enough memory, then fall through to setting to bogus
525   default:
526     // if src is bogus, set ourselves to bogus
527     // do not call setToBogus() here because fArray and fFlags are not consistent here
528     fShortLength = 0;
529     fUnion.fFields.fArray = 0;
530     fUnion.fFields.fCapacity = 0;
531     fFlags = kIsBogus;
532     break;
533   }
534 
535   return *this;
536 }
537 
538 //========================================
539 // Miscellaneous operations
540 //========================================
541 
unescape() const542 UnicodeString UnicodeString::unescape() const {
543     UnicodeString result(length(), (UChar32)0, (int32_t)0); // construct with capacity
544     const UChar *array = getBuffer();
545     int32_t len = length();
546     int32_t prev = 0;
547     for (int32_t i=0;;) {
548         if (i == len) {
549             result.append(array, prev, len - prev);
550             break;
551         }
552         if (array[i++] == 0x5C /*'\\'*/) {
553             result.append(array, prev, (i - 1) - prev);
554             UChar32 c = unescapeAt(i); // advances i
555             if (c < 0) {
556                 result.remove(); // return empty string
557                 break; // invalid escape sequence
558             }
559             result.append(c);
560             prev = i;
561         }
562     }
563     return result;
564 }
565 
unescapeAt(int32_t & offset) const566 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
567     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
568 }
569 
570 //========================================
571 // Read-only implementation
572 //========================================
573 UBool
doEquals(const UnicodeString & text,int32_t len) const574 UnicodeString::doEquals(const UnicodeString &text, int32_t len) const {
575   // Requires: this & text not bogus and have same lengths.
576   // Byte-wise comparison works for equality regardless of endianness.
577   return uprv_memcmp(getArrayStart(), text.getArrayStart(), len * U_SIZEOF_UCHAR) == 0;
578 }
579 
580 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const581 UnicodeString::doCompare( int32_t start,
582               int32_t length,
583               const UChar *srcChars,
584               int32_t srcStart,
585               int32_t srcLength) const
586 {
587   // compare illegal string values
588   if(isBogus()) {
589     return -1;
590   }
591 
592   // pin indices to legal values
593   pinIndices(start, length);
594 
595   if(srcChars == NULL) {
596     // treat const UChar *srcChars==NULL as an empty string
597     return length == 0 ? 0 : 1;
598   }
599 
600   // get the correct pointer
601   const UChar *chars = getArrayStart();
602 
603   chars += start;
604   srcChars += srcStart;
605 
606   int32_t minLength;
607   int8_t lengthResult;
608 
609   // get the srcLength if necessary
610   if(srcLength < 0) {
611     srcLength = u_strlen(srcChars + srcStart);
612   }
613 
614   // are we comparing different lengths?
615   if(length != srcLength) {
616     if(length < srcLength) {
617       minLength = length;
618       lengthResult = -1;
619     } else {
620       minLength = srcLength;
621       lengthResult = 1;
622     }
623   } else {
624     minLength = length;
625     lengthResult = 0;
626   }
627 
628   /*
629    * note that uprv_memcmp() returns an int but we return an int8_t;
630    * we need to take care not to truncate the result -
631    * one way to do this is to right-shift the value to
632    * move the sign bit into the lower 8 bits and making sure that this
633    * does not become 0 itself
634    */
635 
636   if(minLength > 0 && chars != srcChars) {
637     int32_t result;
638 
639 #   if U_IS_BIG_ENDIAN
640       // big-endian: byte comparison works
641       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
642       if(result != 0) {
643         return (int8_t)(result >> 15 | 1);
644       }
645 #   else
646       // little-endian: compare UChar units
647       do {
648         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
649         if(result != 0) {
650           return (int8_t)(result >> 15 | 1);
651         }
652       } while(--minLength > 0);
653 #   endif
654   }
655   return lengthResult;
656 }
657 
658 /* String compare in code point order - doCompare() compares in code unit order. */
659 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const660 UnicodeString::doCompareCodePointOrder(int32_t start,
661                                        int32_t length,
662                                        const UChar *srcChars,
663                                        int32_t srcStart,
664                                        int32_t srcLength) const
665 {
666   // compare illegal string values
667   // treat const UChar *srcChars==NULL as an empty string
668   if(isBogus()) {
669     return -1;
670   }
671 
672   // pin indices to legal values
673   pinIndices(start, length);
674 
675   if(srcChars == NULL) {
676     srcStart = srcLength = 0;
677   }
678 
679   int32_t diff = uprv_strCompare(getArrayStart() + start, length, (srcChars!=NULL)?(srcChars + srcStart):NULL, srcLength, FALSE, TRUE);
680   /* translate the 32-bit result into an 8-bit one */
681   if(diff!=0) {
682     return (int8_t)(diff >> 15 | 1);
683   } else {
684     return 0;
685   }
686 }
687 
688 int32_t
getLength() const689 UnicodeString::getLength() const {
690     return length();
691 }
692 
693 UChar
getCharAt(int32_t offset) const694 UnicodeString::getCharAt(int32_t offset) const {
695   return charAt(offset);
696 }
697 
698 UChar32
getChar32At(int32_t offset) const699 UnicodeString::getChar32At(int32_t offset) const {
700   return char32At(offset);
701 }
702 
703 UChar32
char32At(int32_t offset) const704 UnicodeString::char32At(int32_t offset) const
705 {
706   int32_t len = length();
707   if((uint32_t)offset < (uint32_t)len) {
708     const UChar *array = getArrayStart();
709     UChar32 c;
710     U16_GET(array, 0, offset, len, c);
711     return c;
712   } else {
713     return kInvalidUChar;
714   }
715 }
716 
717 int32_t
getChar32Start(int32_t offset) const718 UnicodeString::getChar32Start(int32_t offset) const {
719   if((uint32_t)offset < (uint32_t)length()) {
720     const UChar *array = getArrayStart();
721     U16_SET_CP_START(array, 0, offset);
722     return offset;
723   } else {
724     return 0;
725   }
726 }
727 
728 int32_t
getChar32Limit(int32_t offset) const729 UnicodeString::getChar32Limit(int32_t offset) const {
730   int32_t len = length();
731   if((uint32_t)offset < (uint32_t)len) {
732     const UChar *array = getArrayStart();
733     U16_SET_CP_LIMIT(array, 0, offset, len);
734     return offset;
735   } else {
736     return len;
737   }
738 }
739 
740 int32_t
countChar32(int32_t start,int32_t length) const741 UnicodeString::countChar32(int32_t start, int32_t length) const {
742   pinIndices(start, length);
743   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
744   return u_countChar32(getArrayStart()+start, length);
745 }
746 
747 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const748 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
749   pinIndices(start, length);
750   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
751   return u_strHasMoreChar32Than(getArrayStart()+start, length, number);
752 }
753 
754 int32_t
moveIndex32(int32_t index,int32_t delta) const755 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
756   // pin index
757   int32_t len = length();
758   if(index<0) {
759     index=0;
760   } else if(index>len) {
761     index=len;
762   }
763 
764   const UChar *array = getArrayStart();
765   if(delta>0) {
766     U16_FWD_N(array, index, len, delta);
767   } else {
768     U16_BACK_N(array, 0, index, -delta);
769   }
770 
771   return index;
772 }
773 
774 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const775 UnicodeString::doExtract(int32_t start,
776              int32_t length,
777              UChar *dst,
778              int32_t dstStart) const
779 {
780   // pin indices to legal values
781   pinIndices(start, length);
782 
783   // do not copy anything if we alias dst itself
784   const UChar *array = getArrayStart();
785   if(array + start != dst + dstStart) {
786     us_arrayCopy(array, start, dst, dstStart, length);
787   }
788 }
789 
790 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const791 UnicodeString::extract(UChar *dest, int32_t destCapacity,
792                        UErrorCode &errorCode) const {
793   int32_t len = length();
794   if(U_SUCCESS(errorCode)) {
795     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
796       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
797     } else {
798       const UChar *array = getArrayStart();
799       if(len>0 && len<=destCapacity && array!=dest) {
800         uprv_memcpy(dest, array, len*U_SIZEOF_UCHAR);
801       }
802       return u_terminateUChars(dest, destCapacity, len, &errorCode);
803     }
804   }
805 
806   return len;
807 }
808 
809 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const810 UnicodeString::extract(int32_t start,
811                        int32_t length,
812                        char *target,
813                        int32_t targetCapacity,
814                        enum EInvariant) const
815 {
816   // if the arguments are illegal, then do nothing
817   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
818     return 0;
819   }
820 
821   // pin the indices to legal values
822   pinIndices(start, length);
823 
824   if(length <= targetCapacity) {
825     u_UCharsToChars(getArrayStart() + start, target, length);
826   }
827   UErrorCode status = U_ZERO_ERROR;
828   return u_terminateChars(target, targetCapacity, length, &status);
829 }
830 
831 UnicodeString
tempSubString(int32_t start,int32_t len) const832 UnicodeString::tempSubString(int32_t start, int32_t len) const {
833   pinIndices(start, len);
834   const UChar *array = getBuffer();  // not getArrayStart() to check kIsBogus & kOpenGetBuffer
835   if(array==NULL) {
836     array=fUnion.fStackBuffer;  // anything not NULL because that would make an empty string
837     len=-2;  // bogus result string
838   }
839   return UnicodeString(FALSE, array + start, len);
840 }
841 
842 int32_t
toUTF8(int32_t start,int32_t len,char * target,int32_t capacity) const843 UnicodeString::toUTF8(int32_t start, int32_t len,
844                       char *target, int32_t capacity) const {
845   pinIndices(start, len);
846   int32_t length8;
847   UErrorCode errorCode = U_ZERO_ERROR;
848   u_strToUTF8WithSub(target, capacity, &length8,
849                      getBuffer() + start, len,
850                      0xFFFD,  // Standard substitution character.
851                      NULL,    // Don't care about number of substitutions.
852                      &errorCode);
853   return length8;
854 }
855 
856 #if U_CHARSET_IS_UTF8
857 
858 int32_t
extract(int32_t start,int32_t len,char * target,uint32_t dstSize) const859 UnicodeString::extract(int32_t start, int32_t len,
860                        char *target, uint32_t dstSize) const {
861   // if the arguments are illegal, then do nothing
862   if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
863     return 0;
864   }
865   return toUTF8(start, len, target, dstSize <= 0x7fffffff ? (int32_t)dstSize : 0x7fffffff);
866 }
867 
868 // else see unistr_cnv.cpp
869 #endif
870 
871 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const872 UnicodeString::extractBetween(int32_t start,
873                   int32_t limit,
874                   UnicodeString& target) const {
875   pinIndex(start);
876   pinIndex(limit);
877   doExtract(start, limit - start, target);
878 }
879 
880 // When converting from UTF-16 to UTF-8, the result will have at most 3 times
881 // as many bytes as the source has UChars.
882 // The "worst cases" are writing systems like Indic, Thai and CJK with
883 // 3:1 bytes:UChars.
884 void
toUTF8(ByteSink & sink) const885 UnicodeString::toUTF8(ByteSink &sink) const {
886   int32_t length16 = length();
887   if(length16 != 0) {
888     char stackBuffer[1024];
889     int32_t capacity = (int32_t)sizeof(stackBuffer);
890     UBool utf8IsOwned = FALSE;
891     char *utf8 = sink.GetAppendBuffer(length16 < capacity ? length16 : capacity,
892                                       3*length16,
893                                       stackBuffer, capacity,
894                                       &capacity);
895     int32_t length8 = 0;
896     UErrorCode errorCode = U_ZERO_ERROR;
897     u_strToUTF8WithSub(utf8, capacity, &length8,
898                        getBuffer(), length16,
899                        0xFFFD,  // Standard substitution character.
900                        NULL,    // Don't care about number of substitutions.
901                        &errorCode);
902     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
903       utf8 = (char *)uprv_malloc(length8);
904       if(utf8 != NULL) {
905         utf8IsOwned = TRUE;
906         errorCode = U_ZERO_ERROR;
907         u_strToUTF8WithSub(utf8, length8, &length8,
908                            getBuffer(), length16,
909                            0xFFFD,  // Standard substitution character.
910                            NULL,    // Don't care about number of substitutions.
911                            &errorCode);
912       } else {
913         errorCode = U_MEMORY_ALLOCATION_ERROR;
914       }
915     }
916     if(U_SUCCESS(errorCode)) {
917       sink.Append(utf8, length8);
918       sink.Flush();
919     }
920     if(utf8IsOwned) {
921       uprv_free(utf8);
922     }
923   }
924 }
925 
926 int32_t
toUTF32(UChar32 * utf32,int32_t capacity,UErrorCode & errorCode) const927 UnicodeString::toUTF32(UChar32 *utf32, int32_t capacity, UErrorCode &errorCode) const {
928   int32_t length32=0;
929   if(U_SUCCESS(errorCode)) {
930     // getBuffer() and u_strToUTF32WithSub() check for illegal arguments.
931     u_strToUTF32WithSub(utf32, capacity, &length32,
932         getBuffer(), length(),
933         0xfffd,  // Substitution character.
934         NULL,    // Don't care about number of substitutions.
935         &errorCode);
936   }
937   return length32;
938 }
939 
940 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const941 UnicodeString::indexOf(const UChar *srcChars,
942                int32_t srcStart,
943                int32_t srcLength,
944                int32_t start,
945                int32_t length) const
946 {
947   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
948     return -1;
949   }
950 
951   // UnicodeString does not find empty substrings
952   if(srcLength < 0 && srcChars[srcStart] == 0) {
953     return -1;
954   }
955 
956   // get the indices within bounds
957   pinIndices(start, length);
958 
959   // find the first occurrence of the substring
960   const UChar *array = getArrayStart();
961   const UChar *match = u_strFindFirst(array + start, length, srcChars + srcStart, srcLength);
962   if(match == NULL) {
963     return -1;
964   } else {
965     return (int32_t)(match - array);
966   }
967 }
968 
969 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const970 UnicodeString::doIndexOf(UChar c,
971              int32_t start,
972              int32_t length) const
973 {
974   // pin indices
975   pinIndices(start, length);
976 
977   // find the first occurrence of c
978   const UChar *array = getArrayStart();
979   const UChar *match = u_memchr(array + start, c, length);
980   if(match == NULL) {
981     return -1;
982   } else {
983     return (int32_t)(match - array);
984   }
985 }
986 
987 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const988 UnicodeString::doIndexOf(UChar32 c,
989                          int32_t start,
990                          int32_t length) const {
991   // pin indices
992   pinIndices(start, length);
993 
994   // find the first occurrence of c
995   const UChar *array = getArrayStart();
996   const UChar *match = u_memchr32(array + start, c, length);
997   if(match == NULL) {
998     return -1;
999   } else {
1000     return (int32_t)(match - array);
1001   }
1002 }
1003 
1004 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const1005 UnicodeString::lastIndexOf(const UChar *srcChars,
1006                int32_t srcStart,
1007                int32_t srcLength,
1008                int32_t start,
1009                int32_t length) const
1010 {
1011   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
1012     return -1;
1013   }
1014 
1015   // UnicodeString does not find empty substrings
1016   if(srcLength < 0 && srcChars[srcStart] == 0) {
1017     return -1;
1018   }
1019 
1020   // get the indices within bounds
1021   pinIndices(start, length);
1022 
1023   // find the last occurrence of the substring
1024   const UChar *array = getArrayStart();
1025   const UChar *match = u_strFindLast(array + start, length, srcChars + srcStart, srcLength);
1026   if(match == NULL) {
1027     return -1;
1028   } else {
1029     return (int32_t)(match - array);
1030   }
1031 }
1032 
1033 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const1034 UnicodeString::doLastIndexOf(UChar c,
1035                  int32_t start,
1036                  int32_t length) const
1037 {
1038   if(isBogus()) {
1039     return -1;
1040   }
1041 
1042   // pin indices
1043   pinIndices(start, length);
1044 
1045   // find the last occurrence of c
1046   const UChar *array = getArrayStart();
1047   const UChar *match = u_memrchr(array + start, c, length);
1048   if(match == NULL) {
1049     return -1;
1050   } else {
1051     return (int32_t)(match - array);
1052   }
1053 }
1054 
1055 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const1056 UnicodeString::doLastIndexOf(UChar32 c,
1057                              int32_t start,
1058                              int32_t length) const {
1059   // pin indices
1060   pinIndices(start, length);
1061 
1062   // find the last occurrence of c
1063   const UChar *array = getArrayStart();
1064   const UChar *match = u_memrchr32(array + start, c, length);
1065   if(match == NULL) {
1066     return -1;
1067   } else {
1068     return (int32_t)(match - array);
1069   }
1070 }
1071 
1072 //========================================
1073 // Write implementation
1074 //========================================
1075 
1076 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)1077 UnicodeString::findAndReplace(int32_t start,
1078                   int32_t length,
1079                   const UnicodeString& oldText,
1080                   int32_t oldStart,
1081                   int32_t oldLength,
1082                   const UnicodeString& newText,
1083                   int32_t newStart,
1084                   int32_t newLength)
1085 {
1086   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
1087     return *this;
1088   }
1089 
1090   pinIndices(start, length);
1091   oldText.pinIndices(oldStart, oldLength);
1092   newText.pinIndices(newStart, newLength);
1093 
1094   if(oldLength == 0) {
1095     return *this;
1096   }
1097 
1098   while(length > 0 && length >= oldLength) {
1099     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
1100     if(pos < 0) {
1101       // no more oldText's here: done
1102       break;
1103     } else {
1104       // we found oldText, replace it by newText and go beyond it
1105       replace(pos, oldLength, newText, newStart, newLength);
1106       length -= pos + oldLength - start;
1107       start = pos + newLength;
1108     }
1109   }
1110 
1111   return *this;
1112 }
1113 
1114 
1115 void
setToBogus()1116 UnicodeString::setToBogus()
1117 {
1118   releaseArray();
1119 
1120   fShortLength = 0;
1121   fUnion.fFields.fArray = 0;
1122   fUnion.fFields.fCapacity = 0;
1123   fFlags = kIsBogus;
1124 }
1125 
1126 // turn a bogus string into an empty one
1127 void
unBogus()1128 UnicodeString::unBogus() {
1129   if(fFlags & kIsBogus) {
1130     setToEmpty();
1131   }
1132 }
1133 
1134 // setTo() analogous to the readonly-aliasing constructor with the same signature
1135 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)1136 UnicodeString::setTo(UBool isTerminated,
1137                      const UChar *text,
1138                      int32_t textLength)
1139 {
1140   if(fFlags & kOpenGetBuffer) {
1141     // do not modify a string that has an "open" getBuffer(minCapacity)
1142     return *this;
1143   }
1144 
1145   if(text == NULL) {
1146     // treat as an empty string, do not alias
1147     releaseArray();
1148     setToEmpty();
1149     return *this;
1150   }
1151 
1152   if( textLength < -1 ||
1153       (textLength == -1 && !isTerminated) ||
1154       (textLength >= 0 && isTerminated && text[textLength] != 0)
1155   ) {
1156     setToBogus();
1157     return *this;
1158   }
1159 
1160   releaseArray();
1161 
1162   if(textLength == -1) {
1163     // text is terminated, or else it would have failed the above test
1164     textLength = u_strlen(text);
1165   }
1166   setArray((UChar *)text, textLength, isTerminated ? textLength + 1 : textLength);
1167 
1168   fFlags = kReadonlyAlias;
1169   return *this;
1170 }
1171 
1172 // setTo() analogous to the writable-aliasing constructor with the same signature
1173 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)1174 UnicodeString::setTo(UChar *buffer,
1175                      int32_t buffLength,
1176                      int32_t buffCapacity) {
1177   if(fFlags & kOpenGetBuffer) {
1178     // do not modify a string that has an "open" getBuffer(minCapacity)
1179     return *this;
1180   }
1181 
1182   if(buffer == NULL) {
1183     // treat as an empty string, do not alias
1184     releaseArray();
1185     setToEmpty();
1186     return *this;
1187   }
1188 
1189   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
1190     setToBogus();
1191     return *this;
1192   } else if(buffLength == -1) {
1193     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
1194     const UChar *p = buffer, *limit = buffer + buffCapacity;
1195     while(p != limit && *p != 0) {
1196       ++p;
1197     }
1198     buffLength = (int32_t)(p - buffer);
1199   }
1200 
1201   releaseArray();
1202 
1203   setArray(buffer, buffLength, buffCapacity);
1204   fFlags = kWritableAlias;
1205   return *this;
1206 }
1207 
setToUTF8(const StringPiece & utf8)1208 UnicodeString &UnicodeString::setToUTF8(const StringPiece &utf8) {
1209   unBogus();
1210   int32_t length = utf8.length();
1211   int32_t capacity;
1212   // The UTF-16 string will be at most as long as the UTF-8 string.
1213   if(length <= US_STACKBUF_SIZE) {
1214     capacity = US_STACKBUF_SIZE;
1215   } else {
1216     capacity = length + 1;  // +1 for the terminating NUL.
1217   }
1218   UChar *utf16 = getBuffer(capacity);
1219   int32_t length16;
1220   UErrorCode errorCode = U_ZERO_ERROR;
1221   u_strFromUTF8WithSub(utf16, getCapacity(), &length16,
1222       utf8.data(), length,
1223       0xfffd,  // Substitution character.
1224       NULL,    // Don't care about number of substitutions.
1225       &errorCode);
1226   releaseBuffer(length16);
1227   if(U_FAILURE(errorCode)) {
1228     setToBogus();
1229   }
1230   return *this;
1231 }
1232 
1233 UnicodeString&
setCharAt(int32_t offset,UChar c)1234 UnicodeString::setCharAt(int32_t offset,
1235              UChar c)
1236 {
1237   int32_t len = length();
1238   if(cloneArrayIfNeeded() && len > 0) {
1239     if(offset < 0) {
1240       offset = 0;
1241     } else if(offset >= len) {
1242       offset = len - 1;
1243     }
1244 
1245     getArrayStart()[offset] = c;
1246   }
1247   return *this;
1248 }
1249 
1250 UnicodeString&
replace(int32_t start,int32_t _length,UChar32 srcChar)1251 UnicodeString::replace(int32_t start,
1252                int32_t _length,
1253                UChar32 srcChar) {
1254   UChar buffer[U16_MAX_LENGTH];
1255   int32_t count = 0;
1256   UBool isError = FALSE;
1257   U16_APPEND(buffer, count, U16_MAX_LENGTH, srcChar, isError);
1258   // We test isError so that the compiler does not complain that we don't.
1259   // If isError (srcChar is not a valid code point) then count==0 which means
1260   // we remove the source segment rather than replacing it with srcChar.
1261   return doReplace(start, _length, buffer, 0, isError ? 0 : count);
1262 }
1263 
1264 UnicodeString&
append(UChar32 srcChar)1265 UnicodeString::append(UChar32 srcChar) {
1266   UChar buffer[U16_MAX_LENGTH];
1267   int32_t _length = 0;
1268   UBool isError = FALSE;
1269   U16_APPEND(buffer, _length, U16_MAX_LENGTH, srcChar, isError);
1270   // We test isError so that the compiler does not complain that we don't.
1271   // If isError then _length==0 which turns the doReplace() into a no-op anyway.
1272   return isError ? *this : doReplace(length(), 0, buffer, 0, _length);
1273 }
1274 
1275 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1276 UnicodeString::doReplace( int32_t start,
1277               int32_t length,
1278               const UnicodeString& src,
1279               int32_t srcStart,
1280               int32_t srcLength)
1281 {
1282   if(!src.isBogus()) {
1283     // pin the indices to legal values
1284     src.pinIndices(srcStart, srcLength);
1285 
1286     // get the characters from src
1287     // and replace the range in ourselves with them
1288     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1289   } else {
1290     // remove the range
1291     return doReplace(start, length, 0, 0, 0);
1292   }
1293 }
1294 
1295 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1296 UnicodeString::doReplace(int32_t start,
1297              int32_t length,
1298              const UChar *srcChars,
1299              int32_t srcStart,
1300              int32_t srcLength)
1301 {
1302   if(!isWritable()) {
1303     return *this;
1304   }
1305 
1306   int32_t oldLength = this->length();
1307 
1308   // optimize (read-only alias).remove(0, start) and .remove(start, end)
1309   if((fFlags&kBufferIsReadonly) && srcLength == 0) {
1310     if(start == 0) {
1311       // remove prefix by adjusting the array pointer
1312       pinIndex(length);
1313       fUnion.fFields.fArray += length;
1314       fUnion.fFields.fCapacity -= length;
1315       setLength(oldLength - length);
1316       return *this;
1317     } else {
1318       pinIndex(start);
1319       if(length >= (oldLength - start)) {
1320         // remove suffix by reducing the length (like truncate())
1321         setLength(start);
1322         fUnion.fFields.fCapacity = start;  // not NUL-terminated any more
1323         return *this;
1324       }
1325     }
1326   }
1327 
1328   if(srcChars == 0) {
1329     srcStart = srcLength = 0;
1330   } else if(srcLength < 0) {
1331     // get the srcLength if necessary
1332     srcLength = u_strlen(srcChars + srcStart);
1333   }
1334 
1335   // calculate the size of the string after the replace
1336   int32_t newLength;
1337 
1338   // optimize append() onto a large-enough, owned string
1339   if(start >= oldLength) {
1340     if(srcLength == 0) {
1341       return *this;
1342     }
1343     newLength = oldLength + srcLength;
1344     if(newLength <= getCapacity() && isBufferWritable()) {
1345       UChar *oldArray = getArrayStart();
1346       // Do not copy characters when
1347       //   UChar *buffer=str.getAppendBuffer(...);
1348       // is followed by
1349       //   str.append(buffer, length);
1350       // or
1351       //   str.appendString(buffer, length)
1352       // or similar.
1353       if(srcChars + srcStart != oldArray + start || start > oldLength) {
1354         us_arrayCopy(srcChars, srcStart, oldArray, oldLength, srcLength);
1355       }
1356       setLength(newLength);
1357       return *this;
1358     } else {
1359       // pin the indices to legal values
1360       start = oldLength;
1361       length = 0;
1362     }
1363   } else {
1364     // pin the indices to legal values
1365     pinIndices(start, length);
1366 
1367     newLength = oldLength - length + srcLength;
1368   }
1369 
1370   // the following may change fArray but will not copy the current contents;
1371   // therefore we need to keep the current fArray
1372   UChar oldStackBuffer[US_STACKBUF_SIZE];
1373   UChar *oldArray;
1374   if((fFlags&kUsingStackBuffer) && (newLength > US_STACKBUF_SIZE)) {
1375     // copy the stack buffer contents because it will be overwritten with
1376     // fUnion.fFields values
1377     u_memcpy(oldStackBuffer, fUnion.fStackBuffer, oldLength);
1378     oldArray = oldStackBuffer;
1379   } else {
1380     oldArray = getArrayStart();
1381   }
1382 
1383   // clone our array and allocate a bigger array if needed
1384   int32_t *bufferToDelete = 0;
1385   if(!cloneArrayIfNeeded(newLength, newLength + (newLength >> 2) + kGrowSize,
1386                          FALSE, &bufferToDelete)
1387   ) {
1388     return *this;
1389   }
1390 
1391   // now do the replace
1392 
1393   UChar *newArray = getArrayStart();
1394   if(newArray != oldArray) {
1395     // if fArray changed, then we need to copy everything except what will change
1396     us_arrayCopy(oldArray, 0, newArray, 0, start);
1397     us_arrayCopy(oldArray, start + length,
1398                  newArray, start + srcLength,
1399                  oldLength - (start + length));
1400   } else if(length != srcLength) {
1401     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1402     us_arrayCopy(oldArray, start + length,
1403                  newArray, start + srcLength,
1404                  oldLength - (start + length));
1405   }
1406 
1407   // now fill in the hole with the new string
1408   us_arrayCopy(srcChars, srcStart, newArray, start, srcLength);
1409 
1410   setLength(newLength);
1411 
1412   // delayed delete in case srcChars == fArray when we started, and
1413   // to keep oldArray alive for the above operations
1414   if (bufferToDelete) {
1415     uprv_free(bufferToDelete);
1416   }
1417 
1418   return *this;
1419 }
1420 
1421 /**
1422  * Replaceable API
1423  */
1424 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1425 UnicodeString::handleReplaceBetween(int32_t start,
1426                                     int32_t limit,
1427                                     const UnicodeString& text) {
1428     replaceBetween(start, limit, text);
1429 }
1430 
1431 /**
1432  * Replaceable API
1433  */
1434 void
copy(int32_t start,int32_t limit,int32_t dest)1435 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1436     if (limit <= start) {
1437         return; // Nothing to do; avoid bogus malloc call
1438     }
1439     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1440     // Check to make sure text is not null.
1441     if (text != NULL) {
1442 	    extractBetween(start, limit, text, 0);
1443 	    insert(dest, text, 0, limit - start);
1444 	    uprv_free(text);
1445     }
1446 }
1447 
1448 /**
1449  * Replaceable API
1450  *
1451  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1452  * so we implement this function here.
1453  */
hasMetaData() const1454 UBool Replaceable::hasMetaData() const {
1455     return TRUE;
1456 }
1457 
1458 /**
1459  * Replaceable API
1460  */
hasMetaData() const1461 UBool UnicodeString::hasMetaData() const {
1462     return FALSE;
1463 }
1464 
1465 UnicodeString&
doReverse(int32_t start,int32_t length)1466 UnicodeString::doReverse(int32_t start, int32_t length) {
1467   if(length <= 1 || !cloneArrayIfNeeded()) {
1468     return *this;
1469   }
1470 
1471   // pin the indices to legal values
1472   pinIndices(start, length);
1473   if(length <= 1) {  // pinIndices() might have shrunk the length
1474     return *this;
1475   }
1476 
1477   UChar *left = getArrayStart() + start;
1478   UChar *right = left + length - 1;  // -1 for inclusive boundary (length>=2)
1479   UChar swap;
1480   UBool hasSupplementary = FALSE;
1481 
1482   // Before the loop we know left<right because length>=2.
1483   do {
1484     hasSupplementary |= (UBool)U16_IS_LEAD(swap = *left);
1485     hasSupplementary |= (UBool)U16_IS_LEAD(*left++ = *right);
1486     *right-- = swap;
1487   } while(left < right);
1488   // Make sure to test the middle code unit of an odd-length string.
1489   // Redundant if the length is even.
1490   hasSupplementary |= (UBool)U16_IS_LEAD(*left);
1491 
1492   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1493   if(hasSupplementary) {
1494     UChar swap2;
1495 
1496     left = getArrayStart() + start;
1497     right = left + length - 1; // -1 so that we can look at *(left+1) if left<right
1498     while(left < right) {
1499       if(U16_IS_TRAIL(swap = *left) && U16_IS_LEAD(swap2 = *(left + 1))) {
1500         *left++ = swap2;
1501         *left++ = swap;
1502       } else {
1503         ++left;
1504       }
1505     }
1506   }
1507 
1508   return *this;
1509 }
1510 
1511 UBool
padLeading(int32_t targetLength,UChar padChar)1512 UnicodeString::padLeading(int32_t targetLength,
1513                           UChar padChar)
1514 {
1515   int32_t oldLength = length();
1516   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1517     return FALSE;
1518   } else {
1519     // move contents up by padding width
1520     UChar *array = getArrayStart();
1521     int32_t start = targetLength - oldLength;
1522     us_arrayCopy(array, 0, array, start, oldLength);
1523 
1524     // fill in padding character
1525     while(--start >= 0) {
1526       array[start] = padChar;
1527     }
1528     setLength(targetLength);
1529     return TRUE;
1530   }
1531 }
1532 
1533 UBool
padTrailing(int32_t targetLength,UChar padChar)1534 UnicodeString::padTrailing(int32_t targetLength,
1535                            UChar padChar)
1536 {
1537   int32_t oldLength = length();
1538   if(oldLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1539     return FALSE;
1540   } else {
1541     // fill in padding character
1542     UChar *array = getArrayStart();
1543     int32_t length = targetLength;
1544     while(--length >= oldLength) {
1545       array[length] = padChar;
1546     }
1547     setLength(targetLength);
1548     return TRUE;
1549   }
1550 }
1551 
1552 //========================================
1553 // Hashing
1554 //========================================
1555 int32_t
doHashCode() const1556 UnicodeString::doHashCode() const
1557 {
1558     /* Delegate hash computation to uhash.  This makes UnicodeString
1559      * hashing consistent with UChar* hashing.  */
1560     int32_t hashCode = ustr_hashUCharsN(getArrayStart(), length());
1561     if (hashCode == kInvalidHashCode) {
1562         hashCode = kEmptyHashCode;
1563     }
1564     return hashCode;
1565 }
1566 
1567 //========================================
1568 // External Buffer
1569 //========================================
1570 
1571 UChar *
getBuffer(int32_t minCapacity)1572 UnicodeString::getBuffer(int32_t minCapacity) {
1573   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1574     fFlags|=kOpenGetBuffer;
1575     fShortLength=0;
1576     return getArrayStart();
1577   } else {
1578     return 0;
1579   }
1580 }
1581 
1582 void
releaseBuffer(int32_t newLength)1583 UnicodeString::releaseBuffer(int32_t newLength) {
1584   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1585     // set the new fLength
1586     int32_t capacity=getCapacity();
1587     if(newLength==-1) {
1588       // the new length is the string length, capped by fCapacity
1589       const UChar *array=getArrayStart(), *p=array, *limit=array+capacity;
1590       while(p<limit && *p!=0) {
1591         ++p;
1592       }
1593       newLength=(int32_t)(p-array);
1594     } else if(newLength>capacity) {
1595       newLength=capacity;
1596     }
1597     setLength(newLength);
1598     fFlags&=~kOpenGetBuffer;
1599   }
1600 }
1601 
1602 //========================================
1603 // Miscellaneous
1604 //========================================
1605 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1606 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1607                                   int32_t growCapacity,
1608                                   UBool doCopyArray,
1609                                   int32_t **pBufferToDelete,
1610                                   UBool forceClone) {
1611   // default parameters need to be static, therefore
1612   // the defaults are -1 to have convenience defaults
1613   if(newCapacity == -1) {
1614     newCapacity = getCapacity();
1615   }
1616 
1617   // while a getBuffer(minCapacity) is "open",
1618   // prevent any modifications of the string by returning FALSE here
1619   // if the string is bogus, then only an assignment or similar can revive it
1620   if(!isWritable()) {
1621     return FALSE;
1622   }
1623 
1624   /*
1625    * We need to make a copy of the array if
1626    * the buffer is read-only, or
1627    * the buffer is refCounted (shared), and refCount>1, or
1628    * the buffer is too small.
1629    * Return FALSE if memory could not be allocated.
1630    */
1631   if(forceClone ||
1632      fFlags & kBufferIsReadonly ||
1633      (fFlags & kRefCounted && refCount() > 1) ||
1634      newCapacity > getCapacity()
1635   ) {
1636     // check growCapacity for default value and use of the stack buffer
1637     if(growCapacity < 0) {
1638       growCapacity = newCapacity;
1639     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1640       growCapacity = US_STACKBUF_SIZE;
1641     }
1642 
1643     // save old values
1644     UChar oldStackBuffer[US_STACKBUF_SIZE];
1645     UChar *oldArray;
1646     uint8_t flags = fFlags;
1647 
1648     if(flags&kUsingStackBuffer) {
1649       U_ASSERT(!(flags&kRefCounted)); /* kRefCounted and kUsingStackBuffer are mutally exclusive */
1650       if(doCopyArray && growCapacity > US_STACKBUF_SIZE) {
1651         // copy the stack buffer contents because it will be overwritten with
1652         // fUnion.fFields values
1653         us_arrayCopy(fUnion.fStackBuffer, 0, oldStackBuffer, 0, fShortLength);
1654         oldArray = oldStackBuffer;
1655       } else {
1656         oldArray = 0; // no need to copy from stack buffer to itself
1657       }
1658     } else {
1659       oldArray = fUnion.fFields.fArray;
1660       U_ASSERT(oldArray!=NULL); /* when stack buffer is not used, oldArray must have a non-NULL reference */
1661     }
1662 
1663     // allocate a new array
1664     if(allocate(growCapacity) ||
1665        (newCapacity < growCapacity && allocate(newCapacity))
1666     ) {
1667       if(doCopyArray && oldArray != 0) {
1668         // copy the contents
1669         // do not copy more than what fits - it may be smaller than before
1670         int32_t minLength = length();
1671         newCapacity = getCapacity();
1672         if(newCapacity < minLength) {
1673           minLength = newCapacity;
1674           setLength(minLength);
1675         }
1676         us_arrayCopy(oldArray, 0, getArrayStart(), 0, minLength);
1677       } else {
1678         fShortLength = 0;
1679       }
1680 
1681       // release the old array
1682       if(flags & kRefCounted) {
1683         // the array is refCounted; decrement and release if 0
1684         int32_t *pRefCount = ((int32_t *)oldArray - 1);
1685         if(umtx_atomic_dec(pRefCount) == 0) {
1686           if(pBufferToDelete == 0) {
1687             uprv_free(pRefCount);
1688           } else {
1689             // the caller requested to delete it himself
1690             *pBufferToDelete = pRefCount;
1691           }
1692         }
1693       }
1694     } else {
1695       // not enough memory for growCapacity and not even for the smaller newCapacity
1696       // reset the old values for setToBogus() to release the array
1697       if(!(flags&kUsingStackBuffer)) {
1698         fUnion.fFields.fArray = oldArray;
1699       }
1700       fFlags = flags;
1701       setToBogus();
1702       return FALSE;
1703     }
1704   }
1705   return TRUE;
1706 }
1707 
1708 // UnicodeStringAppendable ------------------------------------------------- ***
1709 
~UnicodeStringAppendable()1710 UnicodeStringAppendable::~UnicodeStringAppendable() {}
1711 
1712 UBool
appendCodeUnit(UChar c)1713 UnicodeStringAppendable::appendCodeUnit(UChar c) {
1714   return str.doReplace(str.length(), 0, &c, 0, 1).isWritable();
1715 }
1716 
1717 UBool
appendCodePoint(UChar32 c)1718 UnicodeStringAppendable::appendCodePoint(UChar32 c) {
1719   UChar buffer[U16_MAX_LENGTH];
1720   int32_t cLength = 0;
1721   UBool isError = FALSE;
1722   U16_APPEND(buffer, cLength, U16_MAX_LENGTH, c, isError);
1723   return !isError && str.doReplace(str.length(), 0, buffer, 0, cLength).isWritable();
1724 }
1725 
1726 UBool
appendString(const UChar * s,int32_t length)1727 UnicodeStringAppendable::appendString(const UChar *s, int32_t length) {
1728   return str.doReplace(str.length(), 0, s, 0, length).isWritable();
1729 }
1730 
1731 UBool
reserveAppendCapacity(int32_t appendCapacity)1732 UnicodeStringAppendable::reserveAppendCapacity(int32_t appendCapacity) {
1733   return str.cloneArrayIfNeeded(str.length() + appendCapacity);
1734 }
1735 
1736 UChar *
getAppendBuffer(int32_t minCapacity,int32_t desiredCapacityHint,UChar * scratch,int32_t scratchCapacity,int32_t * resultCapacity)1737 UnicodeStringAppendable::getAppendBuffer(int32_t minCapacity,
1738                                          int32_t desiredCapacityHint,
1739                                          UChar *scratch, int32_t scratchCapacity,
1740                                          int32_t *resultCapacity) {
1741   if(minCapacity < 1 || scratchCapacity < minCapacity) {
1742     *resultCapacity = 0;
1743     return NULL;
1744   }
1745   int32_t oldLength = str.length();
1746   if(str.cloneArrayIfNeeded(oldLength + minCapacity, oldLength + desiredCapacityHint)) {
1747     *resultCapacity = str.getCapacity() - oldLength;
1748     return str.getArrayStart() + oldLength;
1749   }
1750   *resultCapacity = scratchCapacity;
1751   return scratch;
1752 }
1753 
1754 U_NAMESPACE_END
1755 
1756 U_NAMESPACE_USE
1757 
1758 U_CAPI int32_t U_EXPORT2
uhash_hashUnicodeString(const UElement key)1759 uhash_hashUnicodeString(const UElement key) {
1760     const UnicodeString *str = (const UnicodeString*) key.pointer;
1761     return (str == NULL) ? 0 : str->hashCode();
1762 }
1763 
1764 // Moved here from uhash_us.cpp so that using a UVector of UnicodeString*
1765 // does not depend on hashtable code.
1766 U_CAPI UBool U_EXPORT2
uhash_compareUnicodeString(const UElement key1,const UElement key2)1767 uhash_compareUnicodeString(const UElement key1, const UElement key2) {
1768     const UnicodeString *str1 = (const UnicodeString*) key1.pointer;
1769     const UnicodeString *str2 = (const UnicodeString*) key2.pointer;
1770     if (str1 == str2) {
1771         return TRUE;
1772     }
1773     if (str1 == NULL || str2 == NULL) {
1774         return FALSE;
1775     }
1776     return *str1 == *str2;
1777 }
1778 
#ifdef U_STATIC_IMPLEMENTATION
/*
This function is never meant to be called. It exists so that the
virtual vector deleting destructor is defined within unistr.cpp.
The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file,
which keeps static library dependencies to a minimum.
*/
static void uprv_UnicodeStringDummy(void) {
    UnicodeString *dummyArray = new UnicodeString[2];
    delete [] dummyArray;
}
#endif
1791