• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * (C) 1999 Lars Knoll (knoll@kde.org)
3  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved.
4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
5  *
6  * This library is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Library General Public
8  * License as published by the Free Software Foundation; either
9  * version 2 of the License, or (at your option) any later version.
10  *
11  * This library is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Library General Public License for more details.
15  *
16  * You should have received a copy of the GNU Library General Public License
17  * along with this library; see the file COPYING.LIB.  If not, write to
18  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19  * Boston, MA 02110-1301, USA.
20  */
21 
22 #include "config.h"
23 #include "WTFString.h"
24 
25 #include "IntegerToStringConversion.h"
26 #include <stdarg.h>
27 #include "wtf/ASCIICType.h"
28 #include "wtf/DataLog.h"
29 #include "wtf/HexNumber.h"
30 #include "wtf/MathExtras.h"
31 #include "wtf/text/CString.h"
32 #include "wtf/StringExtras.h"
33 #include "wtf/Vector.h"
34 #include "wtf/dtoa.h"
35 #include "wtf/unicode/CharacterNames.h"
36 #include "wtf/unicode/UTF8.h"
37 #include "wtf/unicode/Unicode.h"
38 
39 using namespace std;
40 
41 namespace WTF {
42 
43 using namespace Unicode;
44 using namespace std;
45 
46 // Construct a string with UTF-16 data.
String(const UChar * characters,unsigned length)47 String::String(const UChar* characters, unsigned length)
48     : m_impl(characters ? StringImpl::create(characters, length) : 0)
49 {
50 }
51 
52 // Construct a string with UTF-16 data, from a null-terminated source.
String(const UChar * str)53 String::String(const UChar* str)
54 {
55     if (!str)
56         return;
57     m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str));
58 }
59 
60 // Construct a string with latin1 data.
String(const LChar * characters,unsigned length)61 String::String(const LChar* characters, unsigned length)
62     : m_impl(characters ? StringImpl::create(characters, length) : 0)
63 {
64 }
65 
String(const char * characters,unsigned length)66 String::String(const char* characters, unsigned length)
67     : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0)
68 {
69 }
70 
71 // Construct a string with latin1 data, from a null-terminated source.
String(const LChar * characters)72 String::String(const LChar* characters)
73     : m_impl(characters ? StringImpl::create(characters) : 0)
74 {
75 }
76 
String(const char * characters)77 String::String(const char* characters)
78     : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0)
79 {
80 }
81 
append(const String & string)82 void String::append(const String& string)
83 {
84     if (string.isEmpty())
85         return;
86     if (!m_impl) {
87         m_impl = string.m_impl;
88         return;
89     }
90 
91     // FIXME: This is extremely inefficient. So much so that we might want to take this
92     // out of String's API. We can make it better by optimizing the case where exactly
93     // one String is pointing at this StringImpl, but even then it's going to require a
94     // call into the allocator every single time.
95 
96     if (m_impl->is8Bit() && string.m_impl->is8Bit()) {
97         LChar* data;
98         RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
99         RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
100         memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar));
101         memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar));
102         m_impl = newImpl.release();
103         return;
104     }
105 
106     UChar* data;
107     RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
108     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
109 
110     if (m_impl->is8Bit())
111         StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
112     else
113         StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
114 
115     if (string.impl()->is8Bit())
116         StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length());
117     else
118         StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length());
119 
120     m_impl = newImpl.release();
121 }
122 
123 template <typename CharacterType>
appendInternal(CharacterType c)124 inline void String::appendInternal(CharacterType c)
125 {
126     // FIXME: This is extremely inefficient. So much so that we might want to take this
127     // out of String's API. We can make it better by optimizing the case where exactly
128     // one String is pointing at this StringImpl, but even then it's going to require a
129     // call into the allocator every single time.
130     if (!m_impl) {
131         m_impl = StringImpl::create(&c, 1);
132         return;
133     }
134 
135     UChar* data; // FIXME: We should be able to create an 8 bit string via this code path.
136     RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max());
137     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data);
138     if (m_impl->is8Bit())
139         StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
140     else
141         StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
142     data[m_impl->length()] = c;
143     m_impl = newImpl.release();
144 }
145 
append(LChar c)146 void String::append(LChar c)
147 {
148     appendInternal(c);
149 }
150 
append(UChar c)151 void String::append(UChar c)
152 {
153     appendInternal(c);
154 }
155 
codePointCompare(const String & a,const String & b)156 int codePointCompare(const String& a, const String& b)
157 {
158     return codePointCompare(a.impl(), b.impl());
159 }
160 
insert(const String & string,unsigned position)161 void String::insert(const String& string, unsigned position)
162 {
163     if (string.isEmpty()) {
164         if (string.isNull())
165             return;
166         if (isNull())
167             m_impl = string.impl();
168         return;
169     }
170 
171     if (string.is8Bit())
172         insert(string.impl()->characters8(), string.length(), position);
173     else
174         insert(string.impl()->characters16(), string.length(), position);
175 }
176 
append(const LChar * charactersToAppend,unsigned lengthToAppend)177 void String::append(const LChar* charactersToAppend, unsigned lengthToAppend)
178 {
179     if (!m_impl) {
180         if (!charactersToAppend)
181             return;
182         m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
183         return;
184     }
185 
186     if (!lengthToAppend)
187         return;
188 
189     ASSERT(charactersToAppend);
190 
191     unsigned strLength = m_impl->length();
192 
193     if (m_impl->is8Bit()) {
194         RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
195         LChar* data;
196         RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
197         StringImpl::copyChars(data, m_impl->characters8(), strLength);
198         StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
199         m_impl = newImpl.release();
200         return;
201     }
202 
203     RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
204     UChar* data;
205     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data);
206     StringImpl::copyChars(data, m_impl->characters16(), strLength);
207     StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
208     m_impl = newImpl.release();
209 }
210 
append(const UChar * charactersToAppend,unsigned lengthToAppend)211 void String::append(const UChar* charactersToAppend, unsigned lengthToAppend)
212 {
213     if (!m_impl) {
214         if (!charactersToAppend)
215             return;
216         m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
217         return;
218     }
219 
220     if (!lengthToAppend)
221         return;
222 
223     unsigned strLength = m_impl->length();
224 
225     ASSERT(charactersToAppend);
226     RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
227     UChar* data;
228     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
229     if (m_impl->is8Bit())
230         StringImpl::copyChars(data, characters8(), strLength);
231     else
232         StringImpl::copyChars(data, characters16(), strLength);
233     StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
234     m_impl = newImpl.release();
235 }
236 
237 template<typename CharType>
insertInternal(PassRefPtr<StringImpl> impl,const CharType * charactersToInsert,unsigned lengthToInsert,unsigned position)238 PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position)
239 {
240     if (!lengthToInsert)
241         return impl;
242 
243     ASSERT(charactersToInsert);
244     UChar* data; // FIXME: We should be able to create an 8 bit string here.
245     RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length());
246     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data);
247 
248     if (impl->is8Bit())
249         StringImpl::copyChars(data, impl->characters8(), position);
250     else
251         StringImpl::copyChars(data, impl->characters16(), position);
252 
253     StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert);
254 
255     if (impl->is8Bit())
256         StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position);
257     else
258         StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position);
259 
260     return newImpl.release();
261 }
262 
insert(const UChar * charactersToInsert,unsigned lengthToInsert,unsigned position)263 void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
264 {
265     if (position >= length()) {
266         append(charactersToInsert, lengthToInsert);
267         return;
268     }
269     ASSERT(m_impl);
270     m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
271 }
272 
insert(const LChar * charactersToInsert,unsigned lengthToInsert,unsigned position)273 void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
274 {
275     if (position >= length()) {
276         append(charactersToInsert, lengthToInsert);
277         return;
278     }
279     ASSERT(m_impl);
280     m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
281 }
282 
characterStartingAt(unsigned i) const283 UChar32 String::characterStartingAt(unsigned i) const
284 {
285     if (!m_impl || i >= m_impl->length())
286         return 0;
287     return m_impl->characterStartingAt(i);
288 }
289 
ensure16Bit()290 void String::ensure16Bit()
291 {
292     unsigned length = this->length();
293     if (!length || !is8Bit())
294         return;
295     m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl();
296 }
297 
truncate(unsigned position)298 void String::truncate(unsigned position)
299 {
300     if (position >= length())
301         return;
302     if (m_impl->is8Bit()) {
303         LChar* data;
304         RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
305         memcpy(data, m_impl->characters8(), position * sizeof(LChar));
306         m_impl = newImpl.release();
307     } else {
308         UChar* data;
309         RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
310         memcpy(data, m_impl->characters16(), position * sizeof(UChar));
311         m_impl = newImpl.release();
312     }
313 }
314 
315 template <typename CharacterType>
removeInternal(const CharacterType * characters,unsigned position,int lengthToRemove)316 inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove)
317 {
318     CharacterType* data;
319     RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data);
320     memcpy(data, characters, position * sizeof(CharacterType));
321     memcpy(data + position, characters + position + lengthToRemove,
322         (length() - lengthToRemove - position) * sizeof(CharacterType));
323 
324     m_impl = newImpl.release();
325 }
326 
remove(unsigned position,int lengthToRemove)327 void String::remove(unsigned position, int lengthToRemove)
328 {
329     if (lengthToRemove <= 0)
330         return;
331     if (position >= length())
332         return;
333     if (static_cast<unsigned>(lengthToRemove) > length() - position)
334         lengthToRemove = length() - position;
335 
336     if (is8Bit()) {
337         removeInternal(characters8(), position, lengthToRemove);
338 
339         return;
340     }
341 
342     removeInternal(characters16(), position, lengthToRemove);
343 }
344 
substring(unsigned pos,unsigned len) const345 String String::substring(unsigned pos, unsigned len) const
346 {
347     if (!m_impl)
348         return String();
349     return m_impl->substring(pos, len);
350 }
351 
lower() const352 String String::lower() const
353 {
354     if (!m_impl)
355         return String();
356     return m_impl->lower();
357 }
358 
upper() const359 String String::upper() const
360 {
361     if (!m_impl)
362         return String();
363     return m_impl->upper();
364 }
365 
lower(const AtomicString & localeIdentifier) const366 String String::lower(const AtomicString& localeIdentifier) const
367 {
368     if (!m_impl)
369         return String();
370     return m_impl->lower(localeIdentifier);
371 }
372 
upper(const AtomicString & localeIdentifier) const373 String String::upper(const AtomicString& localeIdentifier) const
374 {
375     if (!m_impl)
376         return String();
377     return m_impl->upper(localeIdentifier);
378 }
379 
stripWhiteSpace() const380 String String::stripWhiteSpace() const
381 {
382     if (!m_impl)
383         return String();
384     return m_impl->stripWhiteSpace();
385 }
386 
stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const387 String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const
388 {
389     if (!m_impl)
390         return String();
391     return m_impl->stripWhiteSpace(isWhiteSpace);
392 }
393 
simplifyWhiteSpace(StripBehavior stripBehavior) const394 String String::simplifyWhiteSpace(StripBehavior stripBehavior) const
395 {
396     if (!m_impl)
397         return String();
398     return m_impl->simplifyWhiteSpace(stripBehavior);
399 }
400 
simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace,StripBehavior stripBehavior) const401 String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, StripBehavior stripBehavior) const
402 {
403     if (!m_impl)
404         return String();
405     return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior);
406 }
407 
removeCharacters(CharacterMatchFunctionPtr findMatch) const408 String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const
409 {
410     if (!m_impl)
411         return String();
412     return m_impl->removeCharacters(findMatch);
413 }
414 
foldCase() const415 String String::foldCase() const
416 {
417     if (!m_impl)
418         return String();
419     return m_impl->foldCase();
420 }
421 
percentage(int & result) const422 bool String::percentage(int& result) const
423 {
424     if (!m_impl || !m_impl->length())
425         return false;
426 
427     if ((*m_impl)[m_impl->length() - 1] != '%')
428         return false;
429 
430     if (m_impl->is8Bit())
431         result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1);
432     else
433         result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1);
434 
435     return true;
436 }
437 
charactersWithNullTermination() const438 Vector<UChar> String::charactersWithNullTermination() const
439 {
440     if (!m_impl)
441         return Vector<UChar>();
442 
443     Vector<UChar> result;
444     result.reserveInitialCapacity(length() + 1);
445     appendTo(result);
446     result.append(0);
447     return result;
448 }
449 
copyTo(UChar * buffer,unsigned pos,unsigned maxLength) const450 unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const
451 {
452     unsigned length = this->length();
453     RELEASE_ASSERT(pos <= length);
454     unsigned numCharacters = std::min(length - pos, maxLength);
455     if (!numCharacters)
456         return 0;
457     if (is8Bit())
458         StringImpl::copyChars(buffer, characters8() + pos, numCharacters);
459     else
460         StringImpl::copyChars(buffer, characters16() + pos, numCharacters);
461     return numCharacters;
462 }
463 
format(const char * format,...)464 String String::format(const char *format, ...)
465 {
466     va_list args;
467     va_start(args, format);
468 
469     Vector<char, 256> buffer;
470 
471     // Do the format once to get the length.
472 #if COMPILER(MSVC)
473     int result = _vscprintf(format, args);
474 #else
475     char ch;
476     int result = vsnprintf(&ch, 1, format, args);
477     // We need to call va_end() and then va_start() again here, as the
478     // contents of args is undefined after the call to vsnprintf
479     // according to http://man.cx/snprintf(3)
480     //
481     // Not calling va_end/va_start here happens to work on lots of
482     // systems, but fails e.g. on 64bit Linux.
483     va_end(args);
484     va_start(args, format);
485 #endif
486 
487     if (result == 0)
488         return String("");
489     if (result < 0)
490         return String();
491     unsigned len = result;
492     buffer.grow(len + 1);
493 
494     // Now do the formatting again, guaranteed to fit.
495     vsnprintf(buffer.data(), buffer.size(), format, args);
496 
497     va_end(args);
498 
499     return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len);
500 }
501 
number(int number)502 String String::number(int number)
503 {
504     return numberToStringSigned<String>(number);
505 }
506 
number(unsigned int number)507 String String::number(unsigned int number)
508 {
509     return numberToStringUnsigned<String>(number);
510 }
511 
number(long number)512 String String::number(long number)
513 {
514     return numberToStringSigned<String>(number);
515 }
516 
number(unsigned long number)517 String String::number(unsigned long number)
518 {
519     return numberToStringUnsigned<String>(number);
520 }
521 
number(long long number)522 String String::number(long long number)
523 {
524     return numberToStringSigned<String>(number);
525 }
526 
number(unsigned long long number)527 String String::number(unsigned long long number)
528 {
529     return numberToStringUnsigned<String>(number);
530 }
531 
number(double number,unsigned precision,TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)532 String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)
533 {
534     NumberToStringBuffer buffer;
535     return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros));
536 }
537 
numberToStringECMAScript(double number)538 String String::numberToStringECMAScript(double number)
539 {
540     NumberToStringBuffer buffer;
541     return String(numberToString(number, buffer));
542 }
543 
numberToStringFixedWidth(double number,unsigned decimalPlaces)544 String String::numberToStringFixedWidth(double number, unsigned decimalPlaces)
545 {
546     NumberToStringBuffer buffer;
547     return String(numberToFixedWidthString(number, decimalPlaces, buffer));
548 }
549 
toIntStrict(bool * ok,int base) const550 int String::toIntStrict(bool* ok, int base) const
551 {
552     if (!m_impl) {
553         if (ok)
554             *ok = false;
555         return 0;
556     }
557     return m_impl->toIntStrict(ok, base);
558 }
559 
toUIntStrict(bool * ok,int base) const560 unsigned String::toUIntStrict(bool* ok, int base) const
561 {
562     if (!m_impl) {
563         if (ok)
564             *ok = false;
565         return 0;
566     }
567     return m_impl->toUIntStrict(ok, base);
568 }
569 
toInt64Strict(bool * ok,int base) const570 int64_t String::toInt64Strict(bool* ok, int base) const
571 {
572     if (!m_impl) {
573         if (ok)
574             *ok = false;
575         return 0;
576     }
577     return m_impl->toInt64Strict(ok, base);
578 }
579 
toUInt64Strict(bool * ok,int base) const580 uint64_t String::toUInt64Strict(bool* ok, int base) const
581 {
582     if (!m_impl) {
583         if (ok)
584             *ok = false;
585         return 0;
586     }
587     return m_impl->toUInt64Strict(ok, base);
588 }
589 
toIntPtrStrict(bool * ok,int base) const590 intptr_t String::toIntPtrStrict(bool* ok, int base) const
591 {
592     if (!m_impl) {
593         if (ok)
594             *ok = false;
595         return 0;
596     }
597     return m_impl->toIntPtrStrict(ok, base);
598 }
599 
toInt(bool * ok) const600 int String::toInt(bool* ok) const
601 {
602     if (!m_impl) {
603         if (ok)
604             *ok = false;
605         return 0;
606     }
607     return m_impl->toInt(ok);
608 }
609 
toUInt(bool * ok) const610 unsigned String::toUInt(bool* ok) const
611 {
612     if (!m_impl) {
613         if (ok)
614             *ok = false;
615         return 0;
616     }
617     return m_impl->toUInt(ok);
618 }
619 
toInt64(bool * ok) const620 int64_t String::toInt64(bool* ok) const
621 {
622     if (!m_impl) {
623         if (ok)
624             *ok = false;
625         return 0;
626     }
627     return m_impl->toInt64(ok);
628 }
629 
toUInt64(bool * ok) const630 uint64_t String::toUInt64(bool* ok) const
631 {
632     if (!m_impl) {
633         if (ok)
634             *ok = false;
635         return 0;
636     }
637     return m_impl->toUInt64(ok);
638 }
639 
toIntPtr(bool * ok) const640 intptr_t String::toIntPtr(bool* ok) const
641 {
642     if (!m_impl) {
643         if (ok)
644             *ok = false;
645         return 0;
646     }
647     return m_impl->toIntPtr(ok);
648 }
649 
toDouble(bool * ok) const650 double String::toDouble(bool* ok) const
651 {
652     if (!m_impl) {
653         if (ok)
654             *ok = false;
655         return 0.0;
656     }
657     return m_impl->toDouble(ok);
658 }
659 
toFloat(bool * ok) const660 float String::toFloat(bool* ok) const
661 {
662     if (!m_impl) {
663         if (ok)
664             *ok = false;
665         return 0.0f;
666     }
667     return m_impl->toFloat(ok);
668 }
669 
isolatedCopy() const670 String String::isolatedCopy() const
671 {
672     if (!m_impl)
673         return String();
674     return m_impl->isolatedCopy();
675 }
676 
isSafeToSendToAnotherThread() const677 bool String::isSafeToSendToAnotherThread() const
678 {
679     if (!impl())
680         return true;
681     if (impl()->isStatic())
682         return true;
683     // AtomicStrings are not safe to send between threads as ~StringImpl()
684     // will try to remove them from the wrong AtomicStringTable.
685     if (impl()->isAtomic())
686         return false;
687     if (impl()->hasOneRef())
688         return true;
689     return false;
690 }
691 
split(const String & separator,bool allowEmptyEntries,Vector<String> & result) const692 void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const
693 {
694     result.clear();
695 
696     unsigned startPos = 0;
697     size_t endPos;
698     while ((endPos = find(separator, startPos)) != kNotFound) {
699         if (allowEmptyEntries || startPos != endPos)
700             result.append(substring(startPos, endPos - startPos));
701         startPos = endPos + separator.length();
702     }
703     if (allowEmptyEntries || startPos != length())
704         result.append(substring(startPos));
705 }
706 
split(UChar separator,bool allowEmptyEntries,Vector<String> & result) const707 void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const
708 {
709     result.clear();
710 
711     unsigned startPos = 0;
712     size_t endPos;
713     while ((endPos = find(separator, startPos)) != kNotFound) {
714         if (allowEmptyEntries || startPos != endPos)
715             result.append(substring(startPos, endPos - startPos));
716         startPos = endPos + 1;
717     }
718     if (allowEmptyEntries || startPos != length())
719         result.append(substring(startPos));
720 }
721 
ascii() const722 CString String::ascii() const
723 {
724     // Printable ASCII characters 32..127 and the null character are
725     // preserved, characters outside of this range are converted to '?'.
726 
727     unsigned length = this->length();
728     if (!length) {
729         char* characterBuffer;
730         return CString::newUninitialized(length, characterBuffer);
731     }
732 
733     if (this->is8Bit()) {
734         const LChar* characters = this->characters8();
735 
736         char* characterBuffer;
737         CString result = CString::newUninitialized(length, characterBuffer);
738 
739         for (unsigned i = 0; i < length; ++i) {
740             LChar ch = characters[i];
741             characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
742         }
743 
744         return result;
745     }
746 
747     const UChar* characters = this->characters16();
748 
749     char* characterBuffer;
750     CString result = CString::newUninitialized(length, characterBuffer);
751 
752     for (unsigned i = 0; i < length; ++i) {
753         UChar ch = characters[i];
754         characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
755     }
756 
757     return result;
758 }
759 
latin1() const760 CString String::latin1() const
761 {
762     // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
763     // preserved, characters outside of this range are converted to '?'.
764 
765     unsigned length = this->length();
766 
767     if (!length)
768         return CString("", 0);
769 
770     if (is8Bit())
771         return CString(reinterpret_cast<const char*>(this->characters8()), length);
772 
773     const UChar* characters = this->characters16();
774 
775     char* characterBuffer;
776     CString result = CString::newUninitialized(length, characterBuffer);
777 
778     for (unsigned i = 0; i < length; ++i) {
779         UChar ch = characters[i];
780         characterBuffer[i] = ch > 0xff ? '?' : ch;
781     }
782 
783     return result;
784 }
785 
786 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
putUTF8Triple(char * & buffer,UChar ch)787 static inline void putUTF8Triple(char*& buffer, UChar ch)
788 {
789     ASSERT(ch >= 0x0800);
790     *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
791     *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
792     *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
793 }
794 
utf8(ConversionMode mode) const795 CString String::utf8(ConversionMode mode) const
796 {
797     unsigned length = this->length();
798 
799     if (!length)
800         return CString("", 0);
801 
802     // Allocate a buffer big enough to hold all the characters
803     // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
804     // Optimization ideas, if we find this function is hot:
805     //  * We could speculatively create a CStringBuffer to contain 'length'
806     //    characters, and resize if necessary (i.e. if the buffer contains
807     //    non-ascii characters). (Alternatively, scan the buffer first for
808     //    ascii characters, so we know this will be sufficient).
809     //  * We could allocate a CStringBuffer with an appropriate size to
810     //    have a good chance of being able to write the string into the
811     //    buffer without reallocing (say, 1.5 x length).
812     if (length > numeric_limits<unsigned>::max() / 3)
813         return CString();
814     Vector<char, 1024> bufferVector(length * 3);
815 
816     char* buffer = bufferVector.data();
817 
818     if (is8Bit()) {
819         const LChar* characters = this->characters8();
820 
821         ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
822         ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
823     } else {
824         const UChar* characters = this->characters16();
825 
826         if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) {
827             const UChar* charactersEnd = characters + length;
828             char* bufferEnd = buffer + bufferVector.size();
829             while (characters < charactersEnd) {
830                 // Use strict conversion to detect unpaired surrogates.
831                 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true);
832                 ASSERT(result != targetExhausted);
833                 // Conversion fails when there is an unpaired surrogate.
834                 // Put replacement character (U+FFFD) instead of the unpaired surrogate.
835                 if (result != conversionOK) {
836                     ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
837                     // There should be room left, since one UChar hasn't been converted.
838                     ASSERT((buffer + 3) <= bufferEnd);
839                     putUTF8Triple(buffer, replacementCharacter);
840                     ++characters;
841                 }
842             }
843         } else {
844             bool strict = mode == StrictConversion;
845             ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
846             ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
847 
848             // Only produced from strict conversion.
849             if (result == sourceIllegal) {
850                 ASSERT(strict);
851                 return CString();
852             }
853 
854             // Check for an unconverted high surrogate.
855             if (result == sourceExhausted) {
856                 if (strict)
857                     return CString();
858                 // This should be one unpaired high surrogate. Treat it the same
859                 // was as an unpaired high surrogate would have been handled in
860                 // the middle of a string with non-strict conversion - which is
861                 // to say, simply encode it to UTF-8.
862                 ASSERT((characters + 1) == (this->characters16() + length));
863                 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
864                 // There should be room left, since one UChar hasn't been converted.
865                 ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
866                 putUTF8Triple(buffer, *characters);
867             }
868         }
869     }
870 
871     return CString(bufferVector.data(), buffer - bufferVector.data());
872 }
873 
make8BitFrom16BitSource(const UChar * source,size_t length)874 String String::make8BitFrom16BitSource(const UChar* source, size_t length)
875 {
876     if (!length)
877         return String();
878 
879     LChar* destination;
880     String result = String::createUninitialized(length, destination);
881 
882     copyLCharsFromUCharSource(destination, source, length);
883 
884     return result;
885 }
886 
make16BitFrom8BitSource(const LChar * source,size_t length)887 String String::make16BitFrom8BitSource(const LChar* source, size_t length)
888 {
889     if (!length)
890         return String();
891 
892     UChar* destination;
893     String result = String::createUninitialized(length, destination);
894 
895     StringImpl::copyChars(destination, source, length);
896 
897     return result;
898 }
899 
fromUTF8(const LChar * stringStart,size_t length)900 String String::fromUTF8(const LChar* stringStart, size_t length)
901 {
902     RELEASE_ASSERT(length <= numeric_limits<unsigned>::max());
903 
904     if (!stringStart)
905         return String();
906 
907     if (!length)
908         return emptyString();
909 
910     if (charactersAreAllASCII(stringStart, length))
911         return StringImpl::create(stringStart, length);
912 
913     Vector<UChar, 1024> buffer(length);
914     UChar* bufferStart = buffer.data();
915 
916     UChar* bufferCurrent = bufferStart;
917     const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
918     if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK)
919         return String();
920 
921     unsigned utf16Length = bufferCurrent - bufferStart;
922     ASSERT(utf16Length < length);
923     return StringImpl::create(bufferStart, utf16Length);
924 }
925 
fromUTF8(const LChar * string)926 String String::fromUTF8(const LChar* string)
927 {
928     if (!string)
929         return String();
930     return fromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
931 }
932 
fromUTF8(const CString & s)933 String String::fromUTF8(const CString& s)
934 {
935     return fromUTF8(s.data());
936 }
937 
fromUTF8WithLatin1Fallback(const LChar * string,size_t size)938 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size)
939 {
940     String utf8 = fromUTF8(string, size);
941     if (!utf8)
942         return String(string, size);
943     return utf8;
944 }
945 
946 // String Operations
947 
isCharacterAllowedInBase(UChar c,int base)948 static bool isCharacterAllowedInBase(UChar c, int base)
949 {
950     if (c > 0x7F)
951         return false;
952     if (isASCIIDigit(c))
953         return c - '0' < base;
954     if (isASCIIAlpha(c)) {
955         if (base > 36)
956             base = 36;
957         return (c >= 'a' && c < 'a' + base - 10)
958             || (c >= 'A' && c < 'A' + base - 10);
959     }
960     return false;
961 }
962 
963 template <typename IntegralType, typename CharType>
toIntegralType(const CharType * data,size_t length,bool * ok,int base)964 static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base)
965 {
966     static const IntegralType integralMax = numeric_limits<IntegralType>::max();
967     static const bool isSigned = numeric_limits<IntegralType>::is_signed;
968     const IntegralType maxMultiplier = integralMax / base;
969 
970     IntegralType value = 0;
971     bool isOk = false;
972     bool isNegative = false;
973 
974     if (!data)
975         goto bye;
976 
977     // skip leading whitespace
978     while (length && isSpaceOrNewline(*data)) {
979         --length;
980         ++data;
981     }
982 
983     if (isSigned && length && *data == '-') {
984         --length;
985         ++data;
986         isNegative = true;
987     } else if (length && *data == '+') {
988         --length;
989         ++data;
990     }
991 
992     if (!length || !isCharacterAllowedInBase(*data, base))
993         goto bye;
994 
995     while (length && isCharacterAllowedInBase(*data, base)) {
996         --length;
997         IntegralType digitValue;
998         CharType c = *data;
999         if (isASCIIDigit(c))
1000             digitValue = c - '0';
1001         else if (c >= 'a')
1002             digitValue = c - 'a' + 10;
1003         else
1004             digitValue = c - 'A' + 10;
1005 
1006         if (value > maxMultiplier || (value == maxMultiplier && digitValue > (integralMax % base) + isNegative))
1007             goto bye;
1008 
1009         value = base * value + digitValue;
1010         ++data;
1011     }
1012 
1013 #if COMPILER(MSVC)
1014 #pragma warning(push, 0)
1015 #pragma warning(disable:4146)
1016 #endif
1017 
1018     if (isNegative)
1019         value = -value;
1020 
1021 #if COMPILER(MSVC)
1022 #pragma warning(pop)
1023 #endif
1024 
1025     // skip trailing space
1026     while (length && isSpaceOrNewline(*data)) {
1027         --length;
1028         ++data;
1029     }
1030 
1031     if (!length)
1032         isOk = true;
1033 bye:
1034     if (ok)
1035         *ok = isOk;
1036     return isOk ? value : 0;
1037 }
1038 
// Returns how many characters at the start of [data, data + length) look like
// a decimal integer: any leading whitespace, at most one '+' or '-', then a
// run of ASCII digits. The count includes the whitespace and sign even when
// no digit follows.
template <typename CharType>
static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length)
{
    size_t position = 0;

    // Leading whitespace.
    while (position != length && isSpaceOrNewline(data[position]))
        ++position;

    // Optional single sign character.
    if (position != length) {
        const CharType signCandidate = data[position];
        if (signCandidate == '+' || signCandidate == '-')
            ++position;
    }

    // Decimal digits.
    while (position != length && isASCIIDigit(data[position]))
        ++position;

    return position;
}
1062 
charactersToIntStrict(const LChar * data,size_t length,bool * ok,int base)1063 int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base)
1064 {
1065     return toIntegralType<int, LChar>(data, length, ok, base);
1066 }
1067 
charactersToIntStrict(const UChar * data,size_t length,bool * ok,int base)1068 int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base)
1069 {
1070     return toIntegralType<int, UChar>(data, length, ok, base);
1071 }
1072 
charactersToUIntStrict(const LChar * data,size_t length,bool * ok,int base)1073 unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base)
1074 {
1075     return toIntegralType<unsigned, LChar>(data, length, ok, base);
1076 }
1077 
charactersToUIntStrict(const UChar * data,size_t length,bool * ok,int base)1078 unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base)
1079 {
1080     return toIntegralType<unsigned, UChar>(data, length, ok, base);
1081 }
1082 
charactersToInt64Strict(const LChar * data,size_t length,bool * ok,int base)1083 int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1084 {
1085     return toIntegralType<int64_t, LChar>(data, length, ok, base);
1086 }
1087 
charactersToInt64Strict(const UChar * data,size_t length,bool * ok,int base)1088 int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1089 {
1090     return toIntegralType<int64_t, UChar>(data, length, ok, base);
1091 }
1092 
charactersToUInt64Strict(const LChar * data,size_t length,bool * ok,int base)1093 uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1094 {
1095     return toIntegralType<uint64_t, LChar>(data, length, ok, base);
1096 }
1097 
charactersToUInt64Strict(const UChar * data,size_t length,bool * ok,int base)1098 uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1099 {
1100     return toIntegralType<uint64_t, UChar>(data, length, ok, base);
1101 }
1102 
charactersToIntPtrStrict(const LChar * data,size_t length,bool * ok,int base)1103 intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base)
1104 {
1105     return toIntegralType<intptr_t, LChar>(data, length, ok, base);
1106 }
1107 
charactersToIntPtrStrict(const UChar * data,size_t length,bool * ok,int base)1108 intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base)
1109 {
1110     return toIntegralType<intptr_t, UChar>(data, length, ok, base);
1111 }
1112 
charactersToInt(const LChar * data,size_t length,bool * ok)1113 int charactersToInt(const LChar* data, size_t length, bool* ok)
1114 {
1115     return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1116 }
1117 
charactersToInt(const UChar * data,size_t length,bool * ok)1118 int charactersToInt(const UChar* data, size_t length, bool* ok)
1119 {
1120     return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10);
1121 }
1122 
charactersToUInt(const LChar * data,size_t length,bool * ok)1123 unsigned charactersToUInt(const LChar* data, size_t length, bool* ok)
1124 {
1125     return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1126 }
1127 
charactersToUInt(const UChar * data,size_t length,bool * ok)1128 unsigned charactersToUInt(const UChar* data, size_t length, bool* ok)
1129 {
1130     return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1131 }
1132 
charactersToInt64(const LChar * data,size_t length,bool * ok)1133 int64_t charactersToInt64(const LChar* data, size_t length, bool* ok)
1134 {
1135     return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1136 }
1137 
charactersToInt64(const UChar * data,size_t length,bool * ok)1138 int64_t charactersToInt64(const UChar* data, size_t length, bool* ok)
1139 {
1140     return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1141 }
1142 
charactersToUInt64(const LChar * data,size_t length,bool * ok)1143 uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok)
1144 {
1145     return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1146 }
1147 
charactersToUInt64(const UChar * data,size_t length,bool * ok)1148 uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok)
1149 {
1150     return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1151 }
1152 
charactersToIntPtr(const LChar * data,size_t length,bool * ok)1153 intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok)
1154 {
1155     return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1156 }
1157 
charactersToIntPtr(const UChar * data,size_t length,bool * ok)1158 intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok)
1159 {
1160     return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1161 }
1162 
// Selects whether toDoubleType() still reports success when characters remain
// after the parsed number.
enum TrailingJunkPolicy {
    DisallowTrailingJunk,
    AllowTrailingJunk
};
1164 
1165 template <typename CharType, TrailingJunkPolicy policy>
toDoubleType(const CharType * data,size_t length,bool * ok,size_t & parsedLength)1166 static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength)
1167 {
1168     size_t leadingSpacesLength = 0;
1169     while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength]))
1170         ++leadingSpacesLength;
1171 
1172     double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength);
1173     if (!parsedLength) {
1174         if (ok)
1175             *ok = false;
1176         return 0.0;
1177     }
1178 
1179     parsedLength += leadingSpacesLength;
1180     if (ok)
1181         *ok = policy == AllowTrailingJunk || parsedLength == length;
1182     return number;
1183 }
1184 
charactersToDouble(const LChar * data,size_t length,bool * ok)1185 double charactersToDouble(const LChar* data, size_t length, bool* ok)
1186 {
1187     size_t parsedLength;
1188     return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1189 }
1190 
charactersToDouble(const UChar * data,size_t length,bool * ok)1191 double charactersToDouble(const UChar* data, size_t length, bool* ok)
1192 {
1193     size_t parsedLength;
1194     return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1195 }
1196 
charactersToFloat(const LChar * data,size_t length,bool * ok)1197 float charactersToFloat(const LChar* data, size_t length, bool* ok)
1198 {
1199     // FIXME: This will return ok even when the string fits into a double but not a float.
1200     size_t parsedLength;
1201     return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1202 }
1203 
charactersToFloat(const UChar * data,size_t length,bool * ok)1204 float charactersToFloat(const UChar* data, size_t length, bool* ok)
1205 {
1206     // FIXME: This will return ok even when the string fits into a double but not a float.
1207     size_t parsedLength;
1208     return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1209 }
1210 
charactersToFloat(const LChar * data,size_t length,size_t & parsedLength)1211 float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength)
1212 {
1213     // FIXME: This will return ok even when the string fits into a double but not a float.
1214     return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1215 }
1216 
charactersToFloat(const UChar * data,size_t length,size_t & parsedLength)1217 float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength)
1218 {
1219     // FIXME: This will return ok even when the string fits into a double but not a float.
1220     return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1221 }
1222 
emptyString()1223 const String& emptyString()
1224 {
1225     DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty()));
1226     return emptyString;
1227 }
1228 
1229 } // namespace WTF
1230 
1231 #ifndef NDEBUG
1232 // For use in the debugger
1233 String* string(const char*);
1234 Vector<char> asciiDebug(StringImpl* impl);
1235 Vector<char> asciiDebug(String& string);
1236 
show() const1237 void String::show() const
1238 {
1239     dataLogF("%s\n", asciiDebug(impl()).data());
1240 }
1241 
string(const char * s)1242 String* string(const char* s)
1243 {
1244     // leaks memory!
1245     return new String(s);
1246 }
1247 
asciiDebug(StringImpl * impl)1248 Vector<char> asciiDebug(StringImpl* impl)
1249 {
1250     if (!impl)
1251         return asciiDebug(String("[null]").impl());
1252 
1253     Vector<char> buffer;
1254     for (unsigned i = 0; i < impl->length(); ++i) {
1255         UChar ch = (*impl)[i];
1256         if (isASCIIPrintable(ch)) {
1257             if (ch == '\\')
1258                 buffer.append(ch);
1259             buffer.append(ch);
1260         } else {
1261             buffer.append('\\');
1262             buffer.append('u');
1263             appendUnsignedAsHexFixedSize(ch, buffer, 4);
1264         }
1265     }
1266     buffer.append('\0');
1267     return buffer;
1268 }
1269 
asciiDebug(String & string)1270 Vector<char> asciiDebug(String& string)
1271 {
1272     return asciiDebug(string.impl());
1273 }
1274 
1275 #endif
1276