• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ******************************************************************************
3 * Copyright (C) 1999-2007, International Business Machines Corporation and   *
4 * others. All Rights Reserved.                                               *
5 ******************************************************************************
6 *
7 * File unistr.cpp
8 *
9 * Modification History:
10 *
11 *   Date        Name        Description
12 *   09/25/98    stephen     Creation.
13 *   04/20/99    stephen     Overhauled per 4/16 code review.
14 *   07/09/99    stephen     Renamed {hi,lo},{byte,word} to icu_X for HP/UX
15 *   11/18/99    aliu        Added handleReplaceBetween() to make inherit from
16 *                           Replaceable.
17 *   06/25/01    grhoten     Removed the dependency on iostream
18 ******************************************************************************
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/putil.h"
23 #include "cstring.h"
24 #include "cmemory.h"
25 #include "unicode/ustring.h"
26 #include "unicode/unistr.h"
27 #include "uhash.h"
28 #include "ustr_imp.h"
29 #include "umutex.h"
30 
31 #if 0
32 
33 #if U_IOSTREAM_SOURCE >= 199711
34 #include <iostream>
35 using namespace std;
36 #elif U_IOSTREAM_SOURCE >= 198506
37 #include <iostream.h>
38 #endif
39 
40 //DEBUGGING
41 void
42 print(const UnicodeString& s,
43       const char *name)
44 {
45   UChar c;
46   cout << name << ":|";
47   for(int i = 0; i < s.length(); ++i) {
48     c = s[i];
49     if(c>= 0x007E || c < 0x0020)
50       cout << "[0x" << hex << s[i] << "]";
51     else
52       cout << (char) s[i];
53   }
54   cout << '|' << endl;
55 }
56 
57 void
58 print(const UChar *s,
59       int32_t len,
60       const char *name)
61 {
62   UChar c;
63   cout << name << ":|";
64   for(int i = 0; i < len; ++i) {
65     c = s[i];
66     if(c>= 0x007E || c < 0x0020)
67       cout << "[0x" << hex << s[i] << "]";
68     else
69       cout << (char) s[i];
70   }
71   cout << '|' << endl;
72 }
73 // END DEBUGGING
74 #endif
75 
76 // Local function definitions for now
77 
78 // need to copy areas that may overlap
79 static
80 inline void
us_arrayCopy(const UChar * src,int32_t srcStart,UChar * dst,int32_t dstStart,int32_t count)81 us_arrayCopy(const UChar *src, int32_t srcStart,
82          UChar *dst, int32_t dstStart, int32_t count)
83 {
84   if(count>0) {
85     uprv_memmove(dst+dstStart, src+srcStart, (size_t)(count*sizeof(*src)));
86   }
87 }
88 
89 // u_unescapeAt() callback to get a UChar from a UnicodeString
90 U_CDECL_BEGIN
91 static UChar U_CALLCONV
UnicodeString_charAt(int32_t offset,void * context)92 UnicodeString_charAt(int32_t offset, void *context) {
93     return ((U_NAMESPACE_QUALIFIER UnicodeString*) context)->charAt(offset);
94 }
95 U_CDECL_END
96 
97 U_NAMESPACE_BEGIN
98 
99 /* The Replaceable virtual destructor can't be defined in the header
100    due to how AIX works with multiple definitions of virtual functions.
101 */
~Replaceable()102 Replaceable::~Replaceable() {}
Replaceable()103 Replaceable::Replaceable() {}
104 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(UnicodeString)
105 
106 UnicodeString U_EXPORT2
107 operator+ (const UnicodeString &s1, const UnicodeString &s2) {
108     return
109         UnicodeString(s1.length()+s2.length()+1, (UChar32)0, 0).
110             append(s1).
111                 append(s2);
112 }
113 
114 //========================================
115 // Reference Counting functions, put at top of file so that optimizing compilers
116 //                               have a chance to automatically inline.
117 //========================================
118 
119 void
addRef()120 UnicodeString::addRef()
121 {  umtx_atomic_inc((int32_t *)fArray - 1);}
122 
123 int32_t
removeRef()124 UnicodeString::removeRef()
125 { return umtx_atomic_dec((int32_t *)fArray - 1);}
126 
127 int32_t
refCount() const128 UnicodeString::refCount() const
129 {
130     umtx_lock(NULL);
131     // Note: without the lock to force a memory barrier, we might see a very
132     //       stale value on some multi-processor systems.
133     int32_t  count = *((int32_t *)fArray - 1);
134     umtx_unlock(NULL);
135     return count;
136  }
137 
138 void
releaseArray()139 UnicodeString::releaseArray() {
140   if((fFlags & kRefCounted) && removeRef() == 0) {
141     uprv_free((int32_t *)fArray - 1);
142   }
143 }
144 
145 
146 
147 //========================================
148 // Constructors
149 //========================================
UnicodeString()150 UnicodeString::UnicodeString()
151   : fLength(0),
152     fCapacity(US_STACKBUF_SIZE),
153     fArray(fStackBuffer),
154     fFlags(kShortString)
155 {}
156 
UnicodeString(int32_t capacity,UChar32 c,int32_t count)157 UnicodeString::UnicodeString(int32_t capacity, UChar32 c, int32_t count)
158   : fLength(0),
159     fCapacity(US_STACKBUF_SIZE),
160     fArray(0),
161     fFlags(0)
162 {
163   if(count <= 0 || (uint32_t)c > 0x10ffff) {
164     // just allocate and do not do anything else
165     allocate(capacity);
166   } else {
167     // count > 0, allocate and fill the new string with count c's
168     int32_t unitCount = UTF_CHAR_LENGTH(c), length = count * unitCount;
169     if(capacity < length) {
170       capacity = length;
171     }
172     if(allocate(capacity)) {
173       int32_t i = 0;
174 
175       // fill the new string with c
176       if(unitCount == 1) {
177         // fill with length UChars
178         while(i < length) {
179           fArray[i++] = (UChar)c;
180         }
181       } else {
182         // get the code units for c
183         UChar units[UTF_MAX_CHAR_LENGTH];
184         UTF_APPEND_CHAR_UNSAFE(units, i, c);
185 
186         // now it must be i==unitCount
187         i = 0;
188 
189         // for Unicode, unitCount can only be 1, 2, 3, or 4
190         // 1 is handled above
191         while(i < length) {
192           int32_t unitIdx = 0;
193           while(unitIdx < unitCount) {
194             fArray[i++]=units[unitIdx++];
195           }
196         }
197       }
198     }
199     fLength = length;
200   }
201 }
202 
UnicodeString(UChar ch)203 UnicodeString::UnicodeString(UChar ch)
204   : fLength(1),
205     fCapacity(US_STACKBUF_SIZE),
206     fArray(fStackBuffer),
207     fFlags(kShortString)
208 {
209   fStackBuffer[0] = ch;
210 }
211 
UnicodeString(UChar32 ch)212 UnicodeString::UnicodeString(UChar32 ch)
213   : fLength(1),
214     fCapacity(US_STACKBUF_SIZE),
215     fArray(fStackBuffer),
216     fFlags(kShortString)
217 {
218   int32_t i = 0;
219   UBool isError = FALSE;
220   U16_APPEND(fStackBuffer, i, US_STACKBUF_SIZE, ch, isError);
221   fLength = i;
222 }
223 
UnicodeString(const UChar * text)224 UnicodeString::UnicodeString(const UChar *text)
225   : fLength(0),
226     fCapacity(US_STACKBUF_SIZE),
227     fArray(fStackBuffer),
228     fFlags(kShortString)
229 {
230   doReplace(0, 0, text, 0, -1);
231 }
232 
UnicodeString(const UChar * text,int32_t textLength)233 UnicodeString::UnicodeString(const UChar *text,
234                              int32_t textLength)
235   : fLength(0),
236     fCapacity(US_STACKBUF_SIZE),
237     fArray(fStackBuffer),
238     fFlags(kShortString)
239 {
240   doReplace(0, 0, text, 0, textLength);
241 }
242 
UnicodeString(UBool isTerminated,const UChar * text,int32_t textLength)243 UnicodeString::UnicodeString(UBool isTerminated,
244                              const UChar *text,
245                              int32_t textLength)
246   : fLength(textLength),
247     fCapacity(isTerminated ? textLength + 1 : textLength),
248     fArray((UChar *)text),
249     fFlags(kReadonlyAlias)
250 {
251   if(text == NULL) {
252     // treat as an empty string, do not alias
253     fLength = 0;
254     fCapacity = US_STACKBUF_SIZE;
255     fArray = fStackBuffer;
256     fFlags = kShortString;
257   } else if(textLength < -1 ||
258             (textLength == -1 && !isTerminated) ||
259             (textLength >= 0 && isTerminated && text[textLength] != 0)
260   ) {
261     setToBogus();
262   } else if(textLength == -1) {
263     // text is terminated, or else it would have failed the above test
264     fLength = u_strlen(text);
265     fCapacity = fLength + 1;
266   }
267 }
268 
UnicodeString(UChar * buff,int32_t buffLength,int32_t buffCapacity)269 UnicodeString::UnicodeString(UChar *buff,
270                              int32_t buffLength,
271                              int32_t buffCapacity)
272   : fLength(buffLength),
273     fCapacity(buffCapacity),
274     fArray(buff),
275     fFlags(kWritableAlias)
276 {
277   if(buff == NULL) {
278     // treat as an empty string, do not alias
279     fLength = 0;
280     fCapacity = US_STACKBUF_SIZE;
281     fArray = fStackBuffer;
282     fFlags = kShortString;
283   } else if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
284     setToBogus();
285   } else if(buffLength == -1) {
286     // fLength = u_strlen(buff); but do not look beyond buffCapacity
287     const UChar *p = buff, *limit = buff + buffCapacity;
288     while(p != limit && *p != 0) {
289       ++p;
290     }
291     fLength = (int32_t)(p - buff);
292   }
293 }
294 
UnicodeString(const char * src,int32_t length,EInvariant)295 UnicodeString::UnicodeString(const char *src, int32_t length, EInvariant)
296   : fLength(0),
297     fCapacity(US_STACKBUF_SIZE),
298     fArray(fStackBuffer),
299     fFlags(kShortString)
300 {
301   if(src==NULL) {
302     // treat as an empty string
303   } else {
304     if(length<0) {
305       length=(int32_t)uprv_strlen(src);
306     }
307     if(cloneArrayIfNeeded(length, length, FALSE)) {
308       u_charsToUChars(src, getArrayStart(), length);
309       fLength = length;
310     } else {
311       setToBogus();
312     }
313   }
314 }
315 
// Copy constructor: starts out as an empty stack-buffer string and then
// takes over the contents of that through copyFrom().
UnicodeString::UnicodeString(const UnicodeString& that)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  copyFrom(that);
}
325 
// Constructs a string from that, beginning at srcStart: starts out as an
// empty stack-buffer string and delegates to setTo(that, srcStart).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  setTo(that, srcStart);
}
336 
// Constructs a string from the srcStart/srcLength range of that: starts out
// as an empty stack-buffer string and delegates to
// setTo(that, srcStart, srcLength).
UnicodeString::UnicodeString(const UnicodeString& that,
                             int32_t srcStart,
                             int32_t srcLength)
  : Replaceable(),
    fLength(0), fCapacity(US_STACKBUF_SIZE),
    fArray(fStackBuffer), fFlags(kShortString)
{
  setTo(that, srcStart, srcLength);
}
348 
349 // Replaceable base class clone() default implementation, does not clone
350 Replaceable *
clone() const351 Replaceable::clone() const {
352   return NULL;
353 }
354 
355 // UnicodeString overrides clone() with a real implementation
356 Replaceable *
clone() const357 UnicodeString::clone() const {
358   return new UnicodeString(*this);
359 }
360 
361 //========================================
362 // array allocation
363 //========================================
364 
365 UBool
allocate(int32_t capacity)366 UnicodeString::allocate(int32_t capacity) {
367   if(capacity <= US_STACKBUF_SIZE) {
368     fArray = fStackBuffer;
369     fCapacity = US_STACKBUF_SIZE;
370     fFlags = kShortString;
371   } else {
372     // count bytes for the refCounter and the string capacity, and
373     // round up to a multiple of 16; then divide by 4 and allocate int32_t's
374     // to be safely aligned for the refCount
375     int32_t words = (int32_t)(((sizeof(int32_t) + capacity * U_SIZEOF_UCHAR + 15) & ~15) >> 2);
376     int32_t *array = (int32_t*) uprv_malloc( sizeof(int32_t) * words );
377     if(array != 0) {
378       // set initial refCount and point behind the refCount
379       *array++ = 1;
380 
381       // have fArray point to the first UChar
382       fArray = (UChar *)array;
383       fCapacity = (int32_t)((words - 1) * (sizeof(int32_t) / U_SIZEOF_UCHAR));
384       fFlags = kLongString;
385     } else {
386       fLength = 0;
387       fCapacity = 0;
388       fFlags = kIsBogus;
389       return FALSE;
390     }
391   }
392   return TRUE;
393 }
394 
395 //========================================
396 // Destructor
397 //========================================
~UnicodeString()398 UnicodeString::~UnicodeString()
399 {
400   releaseArray();
401 }
402 
403 
404 //========================================
405 // Assignment
406 //========================================
407 
408 UnicodeString &
operator =(const UnicodeString & src)409 UnicodeString::operator=(const UnicodeString &src) {
410   return copyFrom(src);
411 }
412 
413 UnicodeString &
fastCopyFrom(const UnicodeString & src)414 UnicodeString::fastCopyFrom(const UnicodeString &src) {
415   return copyFrom(src, TRUE);
416 }
417 
418 UnicodeString &
copyFrom(const UnicodeString & src,UBool fastCopy)419 UnicodeString::copyFrom(const UnicodeString &src, UBool fastCopy) {
420   // if assigning to ourselves, do nothing
421   if(this == 0 || this == &src) {
422     return *this;
423   }
424 
425   // is the right side bogus?
426   if(&src == 0 || src.isBogus()) {
427     setToBogus();
428     return *this;
429   }
430 
431   // delete the current contents
432   releaseArray();
433 
434   // we always copy the length
435   fLength = src.fLength;
436   if(fLength == 0) {
437     // empty string - use the stack buffer
438     fArray = fStackBuffer;
439     fCapacity = US_STACKBUF_SIZE;
440     fFlags = kShortString;
441     return *this;
442   }
443 
444   // fLength>0 and not an "open" src.getBuffer(minCapacity)
445   switch(src.fFlags) {
446   case kShortString:
447     // short string using the stack buffer, do the same
448     fArray = fStackBuffer;
449     fCapacity = US_STACKBUF_SIZE;
450     fFlags = kShortString;
451     uprv_memcpy(fStackBuffer, src.fArray, fLength * U_SIZEOF_UCHAR);
452     break;
453   case kLongString:
454     // src uses a refCounted string buffer, use that buffer with refCount
455     // src is const, use a cast - we don't really change it
456     ((UnicodeString &)src).addRef();
457     // copy all fields, share the reference-counted buffer
458     fArray = src.fArray;
459     fCapacity = src.fCapacity;
460     fFlags = src.fFlags;
461     break;
462   case kReadonlyAlias:
463     if(fastCopy) {
464       // src is a readonly alias, do the same
465       // -> maintain the readonly alias as such
466       fArray = src.fArray;
467       fCapacity = src.fCapacity;
468       fFlags = src.fFlags;
469       break;
470     }
471     // else if(!fastCopy) fall through to case kWritableAlias
472     // -> allocate a new buffer and copy the contents
473   case kWritableAlias:
474     // src is a writable alias; we make a copy of that instead
475     if(allocate(fLength)) {
476       uprv_memcpy(fArray, src.fArray, fLength * U_SIZEOF_UCHAR);
477       break;
478     }
479     // if there is not enough memory, then fall through to setting to bogus
480   default:
481     // if src is bogus, set ourselves to bogus
482     // do not call setToBogus() here because fArray and fFlags are not consistent here
483     fArray = 0;
484     fLength = 0;
485     fCapacity = 0;
486     fFlags = kIsBogus;
487     break;
488   }
489 
490   return *this;
491 }
492 
493 //========================================
494 // Miscellaneous operations
495 //========================================
496 
unescape() const497 UnicodeString UnicodeString::unescape() const {
498     UnicodeString result;
499     for (int32_t i=0; i<length(); ) {
500         UChar32 c = charAt(i++);
501         if (c == 0x005C /*'\\'*/) {
502             c = unescapeAt(i); // advances i
503             if (c == (UChar32)0xFFFFFFFF) {
504                 result.remove(); // return empty string
505                 break; // invalid escape sequence
506             }
507         }
508         result.append(c);
509     }
510     return result;
511 }
512 
unescapeAt(int32_t & offset) const513 UChar32 UnicodeString::unescapeAt(int32_t &offset) const {
514     return u_unescapeAt(UnicodeString_charAt, &offset, length(), (void*)this);
515 }
516 
517 //========================================
518 // Read-only implementation
519 //========================================
520 int8_t
doCompare(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const521 UnicodeString::doCompare( int32_t start,
522               int32_t length,
523               const UChar *srcChars,
524               int32_t srcStart,
525               int32_t srcLength) const
526 {
527   // compare illegal string values
528   // treat const UChar *srcChars==NULL as an empty string
529   if(isBogus()) {
530     return -1;
531   }
532 
533   // pin indices to legal values
534   pinIndices(start, length);
535 
536   if(srcChars == NULL) {
537     srcStart = srcLength = 0;
538   }
539 
540   // get the correct pointer
541   const UChar *chars = getArrayStart();
542 
543   chars += start;
544   srcChars += srcStart;
545 
546   int32_t minLength;
547   int8_t lengthResult;
548 
549   // get the srcLength if necessary
550   if(srcLength < 0) {
551     srcLength = u_strlen(srcChars + srcStart);
552   }
553 
554   // are we comparing different lengths?
555   if(length != srcLength) {
556     if(length < srcLength) {
557       minLength = length;
558       lengthResult = -1;
559     } else {
560       minLength = srcLength;
561       lengthResult = 1;
562     }
563   } else {
564     minLength = length;
565     lengthResult = 0;
566   }
567 
568   /*
569    * note that uprv_memcmp() returns an int but we return an int8_t;
570    * we need to take care not to truncate the result -
571    * one way to do this is to right-shift the value to
572    * move the sign bit into the lower 8 bits and making sure that this
573    * does not become 0 itself
574    */
575 
576   if(minLength > 0 && chars != srcChars) {
577     int32_t result;
578 
579 #   if U_IS_BIG_ENDIAN
580       // big-endian: byte comparison works
581       result = uprv_memcmp(chars, srcChars, minLength * sizeof(UChar));
582       if(result != 0) {
583         return (int8_t)(result >> 15 | 1);
584       }
585 #   else
586       // little-endian: compare UChar units
587       do {
588         result = ((int32_t)*(chars++) - (int32_t)*(srcChars++));
589         if(result != 0) {
590           return (int8_t)(result >> 15 | 1);
591         }
592       } while(--minLength > 0);
593 #   endif
594   }
595   return lengthResult;
596 }
597 
598 /* String compare in code point order - doCompare() compares in code unit order. */
599 int8_t
doCompareCodePointOrder(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength) const600 UnicodeString::doCompareCodePointOrder(int32_t start,
601                                        int32_t length,
602                                        const UChar *srcChars,
603                                        int32_t srcStart,
604                                        int32_t srcLength) const
605 {
606   // compare illegal string values
607   // treat const UChar *srcChars==NULL as an empty string
608   if(isBogus()) {
609     return -1;
610   }
611 
612   // pin indices to legal values
613   pinIndices(start, length);
614 
615   if(srcChars == NULL) {
616     srcStart = srcLength = 0;
617   }
618 
619   int32_t diff = uprv_strCompare(fArray + start, length, srcChars + srcStart, srcLength, FALSE, TRUE);
620   /* translate the 32-bit result into an 8-bit one */
621   if(diff!=0) {
622     return (int8_t)(diff >> 15 | 1);
623   } else {
624     return 0;
625   }
626 }
627 
628 int32_t
getLength() const629 UnicodeString::getLength() const {
630     return length();
631 }
632 
633 UChar
getCharAt(int32_t offset) const634 UnicodeString::getCharAt(int32_t offset) const {
635   return charAt(offset);
636 }
637 
638 UChar32
getChar32At(int32_t offset) const639 UnicodeString::getChar32At(int32_t offset) const {
640   return char32At(offset);
641 }
642 
643 int32_t
countChar32(int32_t start,int32_t length) const644 UnicodeString::countChar32(int32_t start, int32_t length) const {
645   pinIndices(start, length);
646   // if(isBogus()) then fArray==0 and start==0 - u_countChar32() checks for NULL
647   return u_countChar32(fArray+start, length);
648 }
649 
650 UBool
hasMoreChar32Than(int32_t start,int32_t length,int32_t number) const651 UnicodeString::hasMoreChar32Than(int32_t start, int32_t length, int32_t number) const {
652   pinIndices(start, length);
653   // if(isBogus()) then fArray==0 and start==0 - u_strHasMoreChar32Than() checks for NULL
654   return u_strHasMoreChar32Than(fArray+start, length, number);
655 }
656 
657 int32_t
moveIndex32(int32_t index,int32_t delta) const658 UnicodeString::moveIndex32(int32_t index, int32_t delta) const {
659   // pin index
660   if(index<0) {
661     index=0;
662   } else if(index>fLength) {
663     index=fLength;
664   }
665 
666   if(delta>0) {
667     UTF_FWD_N(fArray, index, fLength, delta);
668   } else {
669     UTF_BACK_N(fArray, 0, index, -delta);
670   }
671 
672   return index;
673 }
674 
675 void
doExtract(int32_t start,int32_t length,UChar * dst,int32_t dstStart) const676 UnicodeString::doExtract(int32_t start,
677              int32_t length,
678              UChar *dst,
679              int32_t dstStart) const
680 {
681   // pin indices to legal values
682   pinIndices(start, length);
683 
684   // do not copy anything if we alias dst itself
685   if(fArray + start != dst + dstStart) {
686     us_arrayCopy(getArrayStart(), start, dst, dstStart, length);
687   }
688 }
689 
690 int32_t
extract(UChar * dest,int32_t destCapacity,UErrorCode & errorCode) const691 UnicodeString::extract(UChar *dest, int32_t destCapacity,
692                        UErrorCode &errorCode) const {
693   if(U_SUCCESS(errorCode)) {
694     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
695       errorCode=U_ILLEGAL_ARGUMENT_ERROR;
696     } else {
697       if(fLength>0 && fLength<=destCapacity && fArray!=dest) {
698         uprv_memcpy(dest, fArray, fLength*U_SIZEOF_UCHAR);
699       }
700       return u_terminateUChars(dest, destCapacity, fLength, &errorCode);
701     }
702   }
703 
704   return fLength;
705 }
706 
707 int32_t
extract(int32_t start,int32_t length,char * target,int32_t targetCapacity,enum EInvariant) const708 UnicodeString::extract(int32_t start,
709                        int32_t length,
710                        char *target,
711                        int32_t targetCapacity,
712                        enum EInvariant) const
713 {
714   // if the arguments are illegal, then do nothing
715   if(targetCapacity < 0 || (targetCapacity > 0 && target == NULL)) {
716     return 0;
717   }
718 
719   // pin the indices to legal values
720   pinIndices(start, length);
721 
722   if(length <= targetCapacity) {
723     u_UCharsToChars(getArrayStart() + start, target, length);
724   }
725   UErrorCode status = U_ZERO_ERROR;
726   return u_terminateChars(target, targetCapacity, length, &status);
727 }
728 
729 void
extractBetween(int32_t start,int32_t limit,UnicodeString & target) const730 UnicodeString::extractBetween(int32_t start,
731                   int32_t limit,
732                   UnicodeString& target) const {
733   pinIndex(start);
734   pinIndex(limit);
735   doExtract(start, limit - start, target);
736 }
737 
738 int32_t
indexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const739 UnicodeString::indexOf(const UChar *srcChars,
740                int32_t srcStart,
741                int32_t srcLength,
742                int32_t start,
743                int32_t length) const
744 {
745   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
746     return -1;
747   }
748 
749   // UnicodeString does not find empty substrings
750   if(srcLength < 0 && srcChars[srcStart] == 0) {
751     return -1;
752   }
753 
754   // get the indices within bounds
755   pinIndices(start, length);
756 
757   // find the first occurrence of the substring
758   const UChar *match = u_strFindFirst(fArray + start, length, srcChars + srcStart, srcLength);
759   if(match == NULL) {
760     return -1;
761   } else {
762     return (int32_t)(match - fArray);
763   }
764 }
765 
766 int32_t
doIndexOf(UChar c,int32_t start,int32_t length) const767 UnicodeString::doIndexOf(UChar c,
768              int32_t start,
769              int32_t length) const
770 {
771   // pin indices
772   pinIndices(start, length);
773 
774   // find the first occurrence of c
775   const UChar *match = u_memchr(fArray + start, c, length);
776   if(match == NULL) {
777     return -1;
778   } else {
779     return (int32_t)(match - fArray);
780   }
781 }
782 
783 int32_t
doIndexOf(UChar32 c,int32_t start,int32_t length) const784 UnicodeString::doIndexOf(UChar32 c,
785                          int32_t start,
786                          int32_t length) const {
787   // pin indices
788   pinIndices(start, length);
789 
790   // find the first occurrence of c
791   const UChar *match = u_memchr32(fArray + start, c, length);
792   if(match == NULL) {
793     return -1;
794   } else {
795     return (int32_t)(match - fArray);
796   }
797 }
798 
799 int32_t
lastIndexOf(const UChar * srcChars,int32_t srcStart,int32_t srcLength,int32_t start,int32_t length) const800 UnicodeString::lastIndexOf(const UChar *srcChars,
801                int32_t srcStart,
802                int32_t srcLength,
803                int32_t start,
804                int32_t length) const
805 {
806   if(isBogus() || srcChars == 0 || srcStart < 0 || srcLength == 0) {
807     return -1;
808   }
809 
810   // UnicodeString does not find empty substrings
811   if(srcLength < 0 && srcChars[srcStart] == 0) {
812     return -1;
813   }
814 
815   // get the indices within bounds
816   pinIndices(start, length);
817 
818   // find the last occurrence of the substring
819   const UChar *match = u_strFindLast(fArray + start, length, srcChars + srcStart, srcLength);
820   if(match == NULL) {
821     return -1;
822   } else {
823     return (int32_t)(match - fArray);
824   }
825 }
826 
827 int32_t
doLastIndexOf(UChar c,int32_t start,int32_t length) const828 UnicodeString::doLastIndexOf(UChar c,
829                  int32_t start,
830                  int32_t length) const
831 {
832   if(isBogus()) {
833     return -1;
834   }
835 
836   // pin indices
837   pinIndices(start, length);
838 
839   // find the last occurrence of c
840   const UChar *match = u_memrchr(fArray + start, c, length);
841   if(match == NULL) {
842     return -1;
843   } else {
844     return (int32_t)(match - fArray);
845   }
846 }
847 
848 int32_t
doLastIndexOf(UChar32 c,int32_t start,int32_t length) const849 UnicodeString::doLastIndexOf(UChar32 c,
850                              int32_t start,
851                              int32_t length) const {
852   // pin indices
853   pinIndices(start, length);
854 
855   // find the last occurrence of c
856   const UChar *match = u_memrchr32(fArray + start, c, length);
857   if(match == NULL) {
858     return -1;
859   } else {
860     return (int32_t)(match - fArray);
861   }
862 }
863 
864 //========================================
865 // Write implementation
866 //========================================
867 
868 UnicodeString&
findAndReplace(int32_t start,int32_t length,const UnicodeString & oldText,int32_t oldStart,int32_t oldLength,const UnicodeString & newText,int32_t newStart,int32_t newLength)869 UnicodeString::findAndReplace(int32_t start,
870                   int32_t length,
871                   const UnicodeString& oldText,
872                   int32_t oldStart,
873                   int32_t oldLength,
874                   const UnicodeString& newText,
875                   int32_t newStart,
876                   int32_t newLength)
877 {
878   if(isBogus() || oldText.isBogus() || newText.isBogus()) {
879     return *this;
880   }
881 
882   pinIndices(start, length);
883   oldText.pinIndices(oldStart, oldLength);
884   newText.pinIndices(newStart, newLength);
885 
886   if(oldLength == 0) {
887     return *this;
888   }
889 
890   while(length > 0 && length >= oldLength) {
891     int32_t pos = indexOf(oldText, oldStart, oldLength, start, length);
892     if(pos < 0) {
893       // no more oldText's here: done
894       break;
895     } else {
896       // we found oldText, replace it by newText and go beyond it
897       replace(pos, oldLength, newText, newStart, newLength);
898       length -= pos + oldLength - start;
899       start = pos + newLength;
900     }
901   }
902 
903   return *this;
904 }
905 
906 
907 void
setToBogus()908 UnicodeString::setToBogus()
909 {
910   releaseArray();
911 
912   fArray = 0;
913   fCapacity = fLength = 0;
914   fFlags = kIsBogus;
915 }
916 
917 // turn a bogus string into an empty one
918 void
unBogus()919 UnicodeString::unBogus() {
920   if(fFlags & kIsBogus) {
921     fArray = fStackBuffer;
922     fLength = 0;
923     fCapacity = US_STACKBUF_SIZE;
924     fFlags = kShortString;
925   }
926 }
927 
928 // setTo() analogous to the readonly-aliasing constructor with the same signature
929 UnicodeString &
setTo(UBool isTerminated,const UChar * text,int32_t textLength)930 UnicodeString::setTo(UBool isTerminated,
931                      const UChar *text,
932                      int32_t textLength)
933 {
934   if(fFlags & kOpenGetBuffer) {
935     // do not modify a string that has an "open" getBuffer(minCapacity)
936     return *this;
937   }
938 
939   if(text == NULL) {
940     // treat as an empty string, do not alias
941     releaseArray();
942     fLength = 0;
943     fCapacity = US_STACKBUF_SIZE;
944     fArray = fStackBuffer;
945     fFlags = kShortString;
946     return *this;
947   }
948 
949   if( textLength < -1 ||
950       (textLength == -1 && !isTerminated) ||
951       (textLength >= 0 && isTerminated && text[textLength] != 0)
952   ) {
953     setToBogus();
954     return *this;
955   }
956 
957   releaseArray();
958 
959   fArray = (UChar *)text;
960   if(textLength != -1) {
961     fLength = textLength;
962     fCapacity = isTerminated ? fLength + 1 : fLength;
963   } else {
964     // text is terminated, or else it would have failed the above test
965     fLength = u_strlen(text);
966     fCapacity = fLength + 1;
967   }
968 
969   fFlags = kReadonlyAlias;
970   return *this;
971 }
972 
973 // setTo() analogous to the writable-aliasing constructor with the same signature
974 UnicodeString &
setTo(UChar * buffer,int32_t buffLength,int32_t buffCapacity)975 UnicodeString::setTo(UChar *buffer,
976                      int32_t buffLength,
977                      int32_t buffCapacity) {
978   if(fFlags & kOpenGetBuffer) {
979     // do not modify a string that has an "open" getBuffer(minCapacity)
980     return *this;
981   }
982 
983   if(buffer == NULL) {
984     // treat as an empty string, do not alias
985     releaseArray();
986     fLength = 0;
987     fCapacity = US_STACKBUF_SIZE;
988     fArray = fStackBuffer;
989     fFlags = kShortString;
990     return *this;
991   }
992 
993   if(buffLength < -1 || buffCapacity < 0 || buffLength > buffCapacity) {
994     setToBogus();
995     return *this;
996   } else if(buffLength == -1) {
997     // buffLength = u_strlen(buff); but do not look beyond buffCapacity
998     const UChar *p = buffer, *limit = buffer + buffCapacity;
999     while(p != limit && *p != 0) {
1000       ++p;
1001     }
1002     buffLength = (int32_t)(p - buffer);
1003   }
1004 
1005   releaseArray();
1006 
1007   fArray = buffer;
1008   fLength = buffLength;
1009   fCapacity = buffCapacity;
1010   fFlags = kWritableAlias;
1011   return *this;
1012 }
1013 
1014 UnicodeString&
setCharAt(int32_t offset,UChar c)1015 UnicodeString::setCharAt(int32_t offset,
1016              UChar c)
1017 {
1018   if(cloneArrayIfNeeded() && fLength > 0) {
1019     if(offset < 0) {
1020       offset = 0;
1021     } else if(offset >= fLength) {
1022       offset = fLength - 1;
1023     }
1024 
1025     fArray[offset] = c;
1026   }
1027   return *this;
1028 }
1029 
1030 UnicodeString&
doReplace(int32_t start,int32_t length,const UnicodeString & src,int32_t srcStart,int32_t srcLength)1031 UnicodeString::doReplace( int32_t start,
1032               int32_t length,
1033               const UnicodeString& src,
1034               int32_t srcStart,
1035               int32_t srcLength)
1036 {
1037   if(!src.isBogus()) {
1038     // pin the indices to legal values
1039     src.pinIndices(srcStart, srcLength);
1040 
1041     // get the characters from src
1042     // and replace the range in ourselves with them
1043     return doReplace(start, length, src.getArrayStart(), srcStart, srcLength);
1044   } else {
1045     // remove the range
1046     return doReplace(start, length, 0, 0, 0);
1047   }
1048 }
1049 
1050 UnicodeString&
doReplace(int32_t start,int32_t length,const UChar * srcChars,int32_t srcStart,int32_t srcLength)1051 UnicodeString::doReplace(int32_t start,
1052              int32_t length,
1053              const UChar *srcChars,
1054              int32_t srcStart,
1055              int32_t srcLength)
1056 {
1057   if(isBogus()) {
1058     return *this;
1059   }
1060 
1061   if(srcChars == 0) {
1062     srcStart = srcLength = 0;
1063   } else if(srcLength < 0) {
1064     // get the srcLength if necessary
1065     srcLength = u_strlen(srcChars + srcStart);
1066   }
1067 
1068   int32_t *bufferToDelete = 0;
1069 
1070   // the following may change fArray but will not copy the current contents;
1071   // therefore we need to keep the current fArray
1072   UChar *oldArray = fArray;
1073   int32_t oldLength = fLength;
1074 
1075   // pin the indices to legal values
1076   pinIndices(start, length);
1077 
1078   // calculate the size of the string after the replace
1079   int32_t newSize = oldLength - length + srcLength;
1080 
1081   // clone our array and allocate a bigger array if needed
1082   if(!cloneArrayIfNeeded(newSize, newSize + (newSize >> 2) + kGrowSize,
1083                          FALSE, &bufferToDelete)
1084   ) {
1085     return *this;
1086   }
1087 
1088   // now do the replace
1089 
1090   if(fArray != oldArray) {
1091     // if fArray changed, then we need to copy everything except what will change
1092     us_arrayCopy(oldArray, 0, fArray, 0, start);
1093     us_arrayCopy(oldArray, start + length,
1094                  fArray, start + srcLength,
1095                  oldLength - (start + length));
1096   } else if(length != srcLength) {
1097     // fArray did not change; copy only the portion that isn't changing, leaving a hole
1098     us_arrayCopy(oldArray, start + length,
1099                  fArray, start + srcLength,
1100                  oldLength - (start + length));
1101   }
1102 
1103   // now fill in the hole with the new string
1104   us_arrayCopy(srcChars, srcStart, getArrayStart(), start, srcLength);
1105 
1106   fLength = newSize;
1107 
1108   // delayed delete in case srcChars == fArray when we started, and
1109   // to keep oldArray alive for the above operations
1110   if (bufferToDelete) {
1111     uprv_free(bufferToDelete);
1112   }
1113 
1114   return *this;
1115 }
1116 
1117 /**
1118  * Replaceable API
1119  */
1120 void
handleReplaceBetween(int32_t start,int32_t limit,const UnicodeString & text)1121 UnicodeString::handleReplaceBetween(int32_t start,
1122                                     int32_t limit,
1123                                     const UnicodeString& text) {
1124     replaceBetween(start, limit, text);
1125 }
1126 
1127 /**
1128  * Replaceable API
1129  */
1130 void
copy(int32_t start,int32_t limit,int32_t dest)1131 UnicodeString::copy(int32_t start, int32_t limit, int32_t dest) {
1132     if (limit <= start) {
1133         return; // Nothing to do; avoid bogus malloc call
1134     }
1135     UChar* text = (UChar*) uprv_malloc( sizeof(UChar) * (limit - start) );
1136     extractBetween(start, limit, text, 0);
1137     insert(dest, text, 0, limit - start);
1138     uprv_free(text);
1139 }
1140 
1141 /**
1142  * Replaceable API
1143  *
1144  * NOTE: This is for the Replaceable class.  There is no rep.cpp,
1145  * so we implement this function here.
1146  */
hasMetaData() const1147 UBool Replaceable::hasMetaData() const {
1148     return TRUE;
1149 }
1150 
1151 /**
1152  * Replaceable API
1153  */
hasMetaData() const1154 UBool UnicodeString::hasMetaData() const {
1155     return FALSE;
1156 }
1157 
1158 UnicodeString&
doReverse(int32_t start,int32_t length)1159 UnicodeString::doReverse(int32_t start,
1160              int32_t length)
1161 {
1162   if(fLength <= 1 || !cloneArrayIfNeeded()) {
1163     return *this;
1164   }
1165 
1166   // pin the indices to legal values
1167   pinIndices(start, length);
1168 
1169   UChar *left = getArrayStart() + start;
1170   UChar *right = getArrayStart() + start + length;
1171   UChar swap;
1172   UBool hasSupplementary = FALSE;
1173 
1174   while(left < --right) {
1175     hasSupplementary |= (UBool)UTF_IS_LEAD(swap = *left);
1176     hasSupplementary |= (UBool)UTF_IS_LEAD(*left++ = *right);
1177     *right = swap;
1178   }
1179 
1180   /* if there are supplementary code points in the reversed range, then re-swap their surrogates */
1181   if(hasSupplementary) {
1182     UChar swap2;
1183 
1184     left = getArrayStart() + start;
1185     right = getArrayStart() + start + length - 1; // -1 so that we can look at *(left+1) if left<right
1186     while(left < right) {
1187       if(UTF_IS_TRAIL(swap = *left) && UTF_IS_LEAD(swap2 = *(left + 1))) {
1188         *left++ = swap2;
1189         *left++ = swap;
1190       } else {
1191         ++left;
1192       }
1193     }
1194   }
1195 
1196   return *this;
1197 }
1198 
1199 UBool
padLeading(int32_t targetLength,UChar padChar)1200 UnicodeString::padLeading(int32_t targetLength,
1201                           UChar padChar)
1202 {
1203   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1204     return FALSE;
1205   } else {
1206     // move contents up by padding width
1207     int32_t start = targetLength - fLength;
1208     us_arrayCopy(fArray, 0, fArray, start, fLength);
1209 
1210     // fill in padding character
1211     while(--start >= 0) {
1212       fArray[start] = padChar;
1213     }
1214     fLength = targetLength;
1215     return TRUE;
1216   }
1217 }
1218 
1219 UBool
padTrailing(int32_t targetLength,UChar padChar)1220 UnicodeString::padTrailing(int32_t targetLength,
1221                            UChar padChar)
1222 {
1223   if(fLength >= targetLength || !cloneArrayIfNeeded(targetLength)) {
1224     return FALSE;
1225   } else {
1226     // fill in padding character
1227     int32_t length = targetLength;
1228     while(--length >= fLength) {
1229       fArray[length] = padChar;
1230     }
1231     fLength = targetLength;
1232     return TRUE;
1233   }
1234 }
1235 
1236 //========================================
1237 // Hashing
1238 //========================================
1239 int32_t
doHashCode() const1240 UnicodeString::doHashCode() const
1241 {
1242     /* Delegate hash computation to uhash.  This makes UnicodeString
1243      * hashing consistent with UChar* hashing.  */
1244     int32_t hashCode = uhash_hashUCharsN(getArrayStart(), fLength);
1245     if (hashCode == kInvalidHashCode) {
1246         hashCode = kEmptyHashCode;
1247     }
1248     return hashCode;
1249 }
1250 
1251 //========================================
1252 // External Buffer
1253 //========================================
1254 
1255 UChar *
getBuffer(int32_t minCapacity)1256 UnicodeString::getBuffer(int32_t minCapacity) {
1257   if(minCapacity>=-1 && cloneArrayIfNeeded(minCapacity)) {
1258     fFlags|=kOpenGetBuffer;
1259     fLength=0;
1260     return fArray;
1261   } else {
1262     return 0;
1263   }
1264 }
1265 
1266 void
releaseBuffer(int32_t newLength)1267 UnicodeString::releaseBuffer(int32_t newLength) {
1268   if(fFlags&kOpenGetBuffer && newLength>=-1) {
1269     // set the new fLength
1270     if(newLength==-1) {
1271       // the new length is the string length, capped by fCapacity
1272       const UChar *p=fArray, *limit=fArray+fCapacity;
1273       while(p<limit && *p!=0) {
1274         ++p;
1275       }
1276       fLength=(int32_t)(p-fArray);
1277     } else if(newLength<=fCapacity) {
1278       fLength=newLength;
1279     } else {
1280       fLength=fCapacity;
1281     }
1282     fFlags&=~kOpenGetBuffer;
1283   }
1284 }
1285 
1286 //========================================
1287 // Miscellaneous
1288 //========================================
1289 UBool
cloneArrayIfNeeded(int32_t newCapacity,int32_t growCapacity,UBool doCopyArray,int32_t ** pBufferToDelete,UBool forceClone)1290 UnicodeString::cloneArrayIfNeeded(int32_t newCapacity,
1291                                   int32_t growCapacity,
1292                                   UBool doCopyArray,
1293                                   int32_t **pBufferToDelete,
1294                                   UBool forceClone) {
1295   // default parameters need to be static, therefore
1296   // the defaults are -1 to have convenience defaults
1297   if(newCapacity == -1) {
1298     newCapacity = fCapacity;
1299   }
1300 
1301   // while a getBuffer(minCapacity) is "open",
1302   // prevent any modifications of the string by returning FALSE here
1303   // if the string is bogus, then only an assignment or similar can revive it
1304   if((fFlags&(kOpenGetBuffer|kIsBogus))!=0) {
1305     return FALSE;
1306   }
1307 
1308   /*
1309    * We need to make a copy of the array if
1310    * the buffer is read-only, or
1311    * the buffer is refCounted (shared), and refCount>1, or
1312    * the buffer is too small.
1313    * Return FALSE if memory could not be allocated.
1314    */
1315   if(forceClone ||
1316      fFlags & kBufferIsReadonly ||
1317      fFlags & kRefCounted && refCount() > 1 ||
1318      newCapacity > fCapacity
1319   ) {
1320     // save old values
1321     UChar *array = fArray;
1322     uint16_t flags = fFlags;
1323 
1324     // check growCapacity for default value and use of the stack buffer
1325     if(growCapacity == -1) {
1326       growCapacity = newCapacity;
1327     } else if(newCapacity <= US_STACKBUF_SIZE && growCapacity > US_STACKBUF_SIZE) {
1328       growCapacity = US_STACKBUF_SIZE;
1329     }
1330 
1331     // allocate a new array
1332     if(allocate(growCapacity) ||
1333        newCapacity < growCapacity && allocate(newCapacity)
1334     ) {
1335       if(doCopyArray) {
1336         // copy the contents
1337         // do not copy more than what fits - it may be smaller than before
1338         if(fCapacity < fLength) {
1339           fLength = fCapacity;
1340         }
1341         us_arrayCopy(array, 0, fArray, 0, fLength);
1342       } else {
1343         fLength = 0;
1344       }
1345 
1346       // release the old array
1347       if(flags & kRefCounted) {
1348         // the array is refCounted; decrement and release if 0
1349         int32_t *pRefCount = ((int32_t *)array - 1);
1350         if(umtx_atomic_dec(pRefCount) == 0) {
1351           if(pBufferToDelete == 0) {
1352             uprv_free(pRefCount);
1353           } else {
1354             // the caller requested to delete it himself
1355             *pBufferToDelete = pRefCount;
1356           }
1357         }
1358       }
1359     } else {
1360       // not enough memory for growCapacity and not even for the smaller newCapacity
1361       // reset the old values for setToBogus() to release the array
1362       fArray = array;
1363       fFlags = flags;
1364       setToBogus();
1365       return FALSE;
1366     }
1367   }
1368   return TRUE;
1369 }
1370 U_NAMESPACE_END
1371 
1372 #ifdef U_STATIC_IMPLEMENTATION
1373 /*
1374 This should never be called. It is defined here to make sure that the
1375 virtual vector deleting destructor is defined within unistr.cpp.
1376 The vector deleting destructor is already a part of UObject,
1377 but defining it here makes sure that it is included with this object file.
1378 This makes sure that static library dependencies are kept to a minimum.
1379 */
uprv_UnicodeStringDummy(void)1380 static void uprv_UnicodeStringDummy(void) {
1381     U_NAMESPACE_USE
1382     delete [] (new UnicodeString[2]);
1383 }
1384 #endif
1385 
1386