1 /*
2 * (C) 1999 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved.
4 * Copyright (C) 2007-2009 Torch Mobile, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Library General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Library General Public License for more details.
15 *
16 * You should have received a copy of the GNU Library General Public License
17 * along with this library; see the file COPYING.LIB. If not, write to
18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 * Boston, MA 02110-1301, USA.
20 */
21
22 #include "config.h"
23 #include "WTFString.h"
24
25 #include "IntegerToStringConversion.h"
26 #include <stdarg.h>
27 #include "wtf/ASCIICType.h"
28 #include "wtf/DataLog.h"
29 #include "wtf/HexNumber.h"
30 #include "wtf/MathExtras.h"
31 #include "wtf/text/CString.h"
32 #include "wtf/StringExtras.h"
33 #include "wtf/Vector.h"
34 #include "wtf/dtoa.h"
35 #include "wtf/unicode/CharacterNames.h"
36 #include "wtf/unicode/UTF8.h"
37 #include "wtf/unicode/Unicode.h"
38
39 using namespace std;
40
41 namespace WTF {
42
43 using namespace Unicode;
44 using namespace std;
45
46 // Construct a string with UTF-16 data.
String(const UChar * characters,unsigned length)47 String::String(const UChar* characters, unsigned length)
48 : m_impl(characters ? StringImpl::create(characters, length) : 0)
49 {
50 }
51
52 // Construct a string with UTF-16 data, from a null-terminated source.
String(const UChar * str)53 String::String(const UChar* str)
54 {
55 if (!str)
56 return;
57 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str));
58 }
59
60 // Construct a string with latin1 data.
String(const LChar * characters,unsigned length)61 String::String(const LChar* characters, unsigned length)
62 : m_impl(characters ? StringImpl::create(characters, length) : 0)
63 {
64 }
65
String(const char * characters,unsigned length)66 String::String(const char* characters, unsigned length)
67 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0)
68 {
69 }
70
71 // Construct a string with latin1 data, from a null-terminated source.
String(const LChar * characters)72 String::String(const LChar* characters)
73 : m_impl(characters ? StringImpl::create(characters) : 0)
74 {
75 }
76
String(const char * characters)77 String::String(const char* characters)
78 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0)
79 {
80 }
81
append(const String & string)82 void String::append(const String& string)
83 {
84 if (string.isEmpty())
85 return;
86 if (!m_impl) {
87 m_impl = string.m_impl;
88 return;
89 }
90
91 // FIXME: This is extremely inefficient. So much so that we might want to take this
92 // out of String's API. We can make it better by optimizing the case where exactly
93 // one String is pointing at this StringImpl, but even then it's going to require a
94 // call into the allocator every single time.
95
96 if (m_impl->is8Bit() && string.m_impl->is8Bit()) {
97 LChar* data;
98 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
99 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
100 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar));
101 memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar));
102 m_impl = newImpl.release();
103 return;
104 }
105
106 UChar* data;
107 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length());
108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data);
109
110 if (m_impl->is8Bit())
111 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
112 else
113 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
114
115 if (string.impl()->is8Bit())
116 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length());
117 else
118 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length());
119
120 m_impl = newImpl.release();
121 }
122
123 template <typename CharacterType>
appendInternal(CharacterType c)124 inline void String::appendInternal(CharacterType c)
125 {
126 // FIXME: This is extremely inefficient. So much so that we might want to take this
127 // out of String's API. We can make it better by optimizing the case where exactly
128 // one String is pointing at this StringImpl, but even then it's going to require a
129 // call into the allocator every single time.
130 if (!m_impl) {
131 m_impl = StringImpl::create(&c, 1);
132 return;
133 }
134
135 UChar* data; // FIXME: We should be able to create an 8 bit string via this code path.
136 RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max());
137 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data);
138 if (m_impl->is8Bit())
139 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length());
140 else
141 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length());
142 data[m_impl->length()] = c;
143 m_impl = newImpl.release();
144 }
145
append(LChar c)146 void String::append(LChar c)
147 {
148 appendInternal(c);
149 }
150
append(UChar c)151 void String::append(UChar c)
152 {
153 appendInternal(c);
154 }
155
codePointCompare(const String & a,const String & b)156 int codePointCompare(const String& a, const String& b)
157 {
158 return codePointCompare(a.impl(), b.impl());
159 }
160
insert(const String & string,unsigned position)161 void String::insert(const String& string, unsigned position)
162 {
163 if (string.isEmpty()) {
164 if (string.isNull())
165 return;
166 if (isNull())
167 m_impl = string.impl();
168 return;
169 }
170
171 if (string.is8Bit())
172 insert(string.impl()->characters8(), string.length(), position);
173 else
174 insert(string.impl()->characters16(), string.length(), position);
175 }
176
append(const LChar * charactersToAppend,unsigned lengthToAppend)177 void String::append(const LChar* charactersToAppend, unsigned lengthToAppend)
178 {
179 if (!m_impl) {
180 if (!charactersToAppend)
181 return;
182 m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
183 return;
184 }
185
186 if (!lengthToAppend)
187 return;
188
189 ASSERT(charactersToAppend);
190
191 unsigned strLength = m_impl->length();
192
193 if (m_impl->is8Bit()) {
194 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
195 LChar* data;
196 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
197 StringImpl::copyChars(data, m_impl->characters8(), strLength);
198 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
199 m_impl = newImpl.release();
200 return;
201 }
202
203 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
204 UChar* data;
205 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data);
206 StringImpl::copyChars(data, m_impl->characters16(), strLength);
207 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
208 m_impl = newImpl.release();
209 }
210
append(const UChar * charactersToAppend,unsigned lengthToAppend)211 void String::append(const UChar* charactersToAppend, unsigned lengthToAppend)
212 {
213 if (!m_impl) {
214 if (!charactersToAppend)
215 return;
216 m_impl = StringImpl::create(charactersToAppend, lengthToAppend);
217 return;
218 }
219
220 if (!lengthToAppend)
221 return;
222
223 unsigned strLength = m_impl->length();
224
225 ASSERT(charactersToAppend);
226 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength);
227 UChar* data;
228 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data);
229 if (m_impl->is8Bit())
230 StringImpl::copyChars(data, characters8(), strLength);
231 else
232 StringImpl::copyChars(data, characters16(), strLength);
233 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend);
234 m_impl = newImpl.release();
235 }
236
237 template<typename CharType>
insertInternal(PassRefPtr<StringImpl> impl,const CharType * charactersToInsert,unsigned lengthToInsert,unsigned position)238 PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position)
239 {
240 if (!lengthToInsert)
241 return impl;
242
243 ASSERT(charactersToInsert);
244 UChar* data; // FIXME: We should be able to create an 8 bit string here.
245 RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length());
246 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data);
247
248 if (impl->is8Bit())
249 StringImpl::copyChars(data, impl->characters8(), position);
250 else
251 StringImpl::copyChars(data, impl->characters16(), position);
252
253 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert);
254
255 if (impl->is8Bit())
256 StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position);
257 else
258 StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position);
259
260 return newImpl.release();
261 }
262
insert(const UChar * charactersToInsert,unsigned lengthToInsert,unsigned position)263 void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
264 {
265 if (position >= length()) {
266 append(charactersToInsert, lengthToInsert);
267 return;
268 }
269 ASSERT(m_impl);
270 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
271 }
272
insert(const LChar * charactersToInsert,unsigned lengthToInsert,unsigned position)273 void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position)
274 {
275 if (position >= length()) {
276 append(charactersToInsert, lengthToInsert);
277 return;
278 }
279 ASSERT(m_impl);
280 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position);
281 }
282
characterStartingAt(unsigned i) const283 UChar32 String::characterStartingAt(unsigned i) const
284 {
285 if (!m_impl || i >= m_impl->length())
286 return 0;
287 return m_impl->characterStartingAt(i);
288 }
289
ensure16Bit()290 void String::ensure16Bit()
291 {
292 unsigned length = this->length();
293 if (!length || !is8Bit())
294 return;
295 m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl();
296 }
297
truncate(unsigned position)298 void String::truncate(unsigned position)
299 {
300 if (position >= length())
301 return;
302 if (m_impl->is8Bit()) {
303 LChar* data;
304 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
305 memcpy(data, m_impl->characters8(), position * sizeof(LChar));
306 m_impl = newImpl.release();
307 } else {
308 UChar* data;
309 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data);
310 memcpy(data, m_impl->characters16(), position * sizeof(UChar));
311 m_impl = newImpl.release();
312 }
313 }
314
315 template <typename CharacterType>
removeInternal(const CharacterType * characters,unsigned position,int lengthToRemove)316 inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove)
317 {
318 CharacterType* data;
319 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data);
320 memcpy(data, characters, position * sizeof(CharacterType));
321 memcpy(data + position, characters + position + lengthToRemove,
322 (length() - lengthToRemove - position) * sizeof(CharacterType));
323
324 m_impl = newImpl.release();
325 }
326
remove(unsigned position,int lengthToRemove)327 void String::remove(unsigned position, int lengthToRemove)
328 {
329 if (lengthToRemove <= 0)
330 return;
331 if (position >= length())
332 return;
333 if (static_cast<unsigned>(lengthToRemove) > length() - position)
334 lengthToRemove = length() - position;
335
336 if (is8Bit()) {
337 removeInternal(characters8(), position, lengthToRemove);
338
339 return;
340 }
341
342 removeInternal(characters16(), position, lengthToRemove);
343 }
344
substring(unsigned pos,unsigned len) const345 String String::substring(unsigned pos, unsigned len) const
346 {
347 if (!m_impl)
348 return String();
349 return m_impl->substring(pos, len);
350 }
351
lower() const352 String String::lower() const
353 {
354 if (!m_impl)
355 return String();
356 return m_impl->lower();
357 }
358
upper() const359 String String::upper() const
360 {
361 if (!m_impl)
362 return String();
363 return m_impl->upper();
364 }
365
lower(const AtomicString & localeIdentifier) const366 String String::lower(const AtomicString& localeIdentifier) const
367 {
368 if (!m_impl)
369 return String();
370 return m_impl->lower(localeIdentifier);
371 }
372
upper(const AtomicString & localeIdentifier) const373 String String::upper(const AtomicString& localeIdentifier) const
374 {
375 if (!m_impl)
376 return String();
377 return m_impl->upper(localeIdentifier);
378 }
379
stripWhiteSpace() const380 String String::stripWhiteSpace() const
381 {
382 if (!m_impl)
383 return String();
384 return m_impl->stripWhiteSpace();
385 }
386
stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const387 String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const
388 {
389 if (!m_impl)
390 return String();
391 return m_impl->stripWhiteSpace(isWhiteSpace);
392 }
393
simplifyWhiteSpace(StripBehavior stripBehavior) const394 String String::simplifyWhiteSpace(StripBehavior stripBehavior) const
395 {
396 if (!m_impl)
397 return String();
398 return m_impl->simplifyWhiteSpace(stripBehavior);
399 }
400
simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace,StripBehavior stripBehavior) const401 String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, StripBehavior stripBehavior) const
402 {
403 if (!m_impl)
404 return String();
405 return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior);
406 }
407
removeCharacters(CharacterMatchFunctionPtr findMatch) const408 String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const
409 {
410 if (!m_impl)
411 return String();
412 return m_impl->removeCharacters(findMatch);
413 }
414
foldCase() const415 String String::foldCase() const
416 {
417 if (!m_impl)
418 return String();
419 return m_impl->foldCase();
420 }
421
percentage(int & result) const422 bool String::percentage(int& result) const
423 {
424 if (!m_impl || !m_impl->length())
425 return false;
426
427 if ((*m_impl)[m_impl->length() - 1] != '%')
428 return false;
429
430 if (m_impl->is8Bit())
431 result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1);
432 else
433 result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1);
434
435 return true;
436 }
437
charactersWithNullTermination() const438 Vector<UChar> String::charactersWithNullTermination() const
439 {
440 if (!m_impl)
441 return Vector<UChar>();
442
443 Vector<UChar> result;
444 result.reserveInitialCapacity(length() + 1);
445 appendTo(result);
446 result.append(0);
447 return result;
448 }
449
copyTo(UChar * buffer,unsigned pos,unsigned maxLength) const450 unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const
451 {
452 unsigned length = this->length();
453 RELEASE_ASSERT(pos <= length);
454 unsigned numCharacters = std::min(length - pos, maxLength);
455 if (!numCharacters)
456 return 0;
457 if (is8Bit())
458 StringImpl::copyChars(buffer, characters8() + pos, numCharacters);
459 else
460 StringImpl::copyChars(buffer, characters16() + pos, numCharacters);
461 return numCharacters;
462 }
463
format(const char * format,...)464 String String::format(const char *format, ...)
465 {
466 va_list args;
467 va_start(args, format);
468
469 Vector<char, 256> buffer;
470
471 // Do the format once to get the length.
472 #if COMPILER(MSVC)
473 int result = _vscprintf(format, args);
474 #else
475 char ch;
476 int result = vsnprintf(&ch, 1, format, args);
477 // We need to call va_end() and then va_start() again here, as the
478 // contents of args is undefined after the call to vsnprintf
479 // according to http://man.cx/snprintf(3)
480 //
481 // Not calling va_end/va_start here happens to work on lots of
482 // systems, but fails e.g. on 64bit Linux.
483 va_end(args);
484 va_start(args, format);
485 #endif
486
487 if (result == 0)
488 return String("");
489 if (result < 0)
490 return String();
491 unsigned len = result;
492 buffer.grow(len + 1);
493
494 // Now do the formatting again, guaranteed to fit.
495 vsnprintf(buffer.data(), buffer.size(), format, args);
496
497 va_end(args);
498
499 return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len);
500 }
501
number(int number)502 String String::number(int number)
503 {
504 return numberToStringSigned<String>(number);
505 }
506
number(unsigned int number)507 String String::number(unsigned int number)
508 {
509 return numberToStringUnsigned<String>(number);
510 }
511
number(long number)512 String String::number(long number)
513 {
514 return numberToStringSigned<String>(number);
515 }
516
number(unsigned long number)517 String String::number(unsigned long number)
518 {
519 return numberToStringUnsigned<String>(number);
520 }
521
number(long long number)522 String String::number(long long number)
523 {
524 return numberToStringSigned<String>(number);
525 }
526
number(unsigned long long number)527 String String::number(unsigned long long number)
528 {
529 return numberToStringUnsigned<String>(number);
530 }
531
number(double number,unsigned precision,TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)532 String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy)
533 {
534 NumberToStringBuffer buffer;
535 return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros));
536 }
537
numberToStringECMAScript(double number)538 String String::numberToStringECMAScript(double number)
539 {
540 NumberToStringBuffer buffer;
541 return String(numberToString(number, buffer));
542 }
543
numberToStringFixedWidth(double number,unsigned decimalPlaces)544 String String::numberToStringFixedWidth(double number, unsigned decimalPlaces)
545 {
546 NumberToStringBuffer buffer;
547 return String(numberToFixedWidthString(number, decimalPlaces, buffer));
548 }
549
toIntStrict(bool * ok,int base) const550 int String::toIntStrict(bool* ok, int base) const
551 {
552 if (!m_impl) {
553 if (ok)
554 *ok = false;
555 return 0;
556 }
557 return m_impl->toIntStrict(ok, base);
558 }
559
toUIntStrict(bool * ok,int base) const560 unsigned String::toUIntStrict(bool* ok, int base) const
561 {
562 if (!m_impl) {
563 if (ok)
564 *ok = false;
565 return 0;
566 }
567 return m_impl->toUIntStrict(ok, base);
568 }
569
toInt64Strict(bool * ok,int base) const570 int64_t String::toInt64Strict(bool* ok, int base) const
571 {
572 if (!m_impl) {
573 if (ok)
574 *ok = false;
575 return 0;
576 }
577 return m_impl->toInt64Strict(ok, base);
578 }
579
toUInt64Strict(bool * ok,int base) const580 uint64_t String::toUInt64Strict(bool* ok, int base) const
581 {
582 if (!m_impl) {
583 if (ok)
584 *ok = false;
585 return 0;
586 }
587 return m_impl->toUInt64Strict(ok, base);
588 }
589
toIntPtrStrict(bool * ok,int base) const590 intptr_t String::toIntPtrStrict(bool* ok, int base) const
591 {
592 if (!m_impl) {
593 if (ok)
594 *ok = false;
595 return 0;
596 }
597 return m_impl->toIntPtrStrict(ok, base);
598 }
599
toInt(bool * ok) const600 int String::toInt(bool* ok) const
601 {
602 if (!m_impl) {
603 if (ok)
604 *ok = false;
605 return 0;
606 }
607 return m_impl->toInt(ok);
608 }
609
toUInt(bool * ok) const610 unsigned String::toUInt(bool* ok) const
611 {
612 if (!m_impl) {
613 if (ok)
614 *ok = false;
615 return 0;
616 }
617 return m_impl->toUInt(ok);
618 }
619
toInt64(bool * ok) const620 int64_t String::toInt64(bool* ok) const
621 {
622 if (!m_impl) {
623 if (ok)
624 *ok = false;
625 return 0;
626 }
627 return m_impl->toInt64(ok);
628 }
629
toUInt64(bool * ok) const630 uint64_t String::toUInt64(bool* ok) const
631 {
632 if (!m_impl) {
633 if (ok)
634 *ok = false;
635 return 0;
636 }
637 return m_impl->toUInt64(ok);
638 }
639
toIntPtr(bool * ok) const640 intptr_t String::toIntPtr(bool* ok) const
641 {
642 if (!m_impl) {
643 if (ok)
644 *ok = false;
645 return 0;
646 }
647 return m_impl->toIntPtr(ok);
648 }
649
toDouble(bool * ok) const650 double String::toDouble(bool* ok) const
651 {
652 if (!m_impl) {
653 if (ok)
654 *ok = false;
655 return 0.0;
656 }
657 return m_impl->toDouble(ok);
658 }
659
toFloat(bool * ok) const660 float String::toFloat(bool* ok) const
661 {
662 if (!m_impl) {
663 if (ok)
664 *ok = false;
665 return 0.0f;
666 }
667 return m_impl->toFloat(ok);
668 }
669
isolatedCopy() const670 String String::isolatedCopy() const
671 {
672 if (!m_impl)
673 return String();
674 return m_impl->isolatedCopy();
675 }
676
isSafeToSendToAnotherThread() const677 bool String::isSafeToSendToAnotherThread() const
678 {
679 if (!impl())
680 return true;
681 if (impl()->isStatic())
682 return true;
683 // AtomicStrings are not safe to send between threads as ~StringImpl()
684 // will try to remove them from the wrong AtomicStringTable.
685 if (impl()->isAtomic())
686 return false;
687 if (impl()->hasOneRef())
688 return true;
689 return false;
690 }
691
split(const String & separator,bool allowEmptyEntries,Vector<String> & result) const692 void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const
693 {
694 result.clear();
695
696 unsigned startPos = 0;
697 size_t endPos;
698 while ((endPos = find(separator, startPos)) != kNotFound) {
699 if (allowEmptyEntries || startPos != endPos)
700 result.append(substring(startPos, endPos - startPos));
701 startPos = endPos + separator.length();
702 }
703 if (allowEmptyEntries || startPos != length())
704 result.append(substring(startPos));
705 }
706
split(UChar separator,bool allowEmptyEntries,Vector<String> & result) const707 void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const
708 {
709 result.clear();
710
711 unsigned startPos = 0;
712 size_t endPos;
713 while ((endPos = find(separator, startPos)) != kNotFound) {
714 if (allowEmptyEntries || startPos != endPos)
715 result.append(substring(startPos, endPos - startPos));
716 startPos = endPos + 1;
717 }
718 if (allowEmptyEntries || startPos != length())
719 result.append(substring(startPos));
720 }
721
ascii() const722 CString String::ascii() const
723 {
724 // Printable ASCII characters 32..127 and the null character are
725 // preserved, characters outside of this range are converted to '?'.
726
727 unsigned length = this->length();
728 if (!length) {
729 char* characterBuffer;
730 return CString::newUninitialized(length, characterBuffer);
731 }
732
733 if (this->is8Bit()) {
734 const LChar* characters = this->characters8();
735
736 char* characterBuffer;
737 CString result = CString::newUninitialized(length, characterBuffer);
738
739 for (unsigned i = 0; i < length; ++i) {
740 LChar ch = characters[i];
741 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
742 }
743
744 return result;
745 }
746
747 const UChar* characters = this->characters16();
748
749 char* characterBuffer;
750 CString result = CString::newUninitialized(length, characterBuffer);
751
752 for (unsigned i = 0; i < length; ++i) {
753 UChar ch = characters[i];
754 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch;
755 }
756
757 return result;
758 }
759
latin1() const760 CString String::latin1() const
761 {
762 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
763 // preserved, characters outside of this range are converted to '?'.
764
765 unsigned length = this->length();
766
767 if (!length)
768 return CString("", 0);
769
770 if (is8Bit())
771 return CString(reinterpret_cast<const char*>(this->characters8()), length);
772
773 const UChar* characters = this->characters16();
774
775 char* characterBuffer;
776 CString result = CString::newUninitialized(length, characterBuffer);
777
778 for (unsigned i = 0; i < length; ++i) {
779 UChar ch = characters[i];
780 characterBuffer[i] = ch > 0xff ? '?' : ch;
781 }
782
783 return result;
784 }
785
786 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
putUTF8Triple(char * & buffer,UChar ch)787 static inline void putUTF8Triple(char*& buffer, UChar ch)
788 {
789 ASSERT(ch >= 0x0800);
790 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
791 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
792 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
793 }
794
utf8(ConversionMode mode) const795 CString String::utf8(ConversionMode mode) const
796 {
797 unsigned length = this->length();
798
799 if (!length)
800 return CString("", 0);
801
802 // Allocate a buffer big enough to hold all the characters
803 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
804 // Optimization ideas, if we find this function is hot:
805 // * We could speculatively create a CStringBuffer to contain 'length'
806 // characters, and resize if necessary (i.e. if the buffer contains
807 // non-ascii characters). (Alternatively, scan the buffer first for
808 // ascii characters, so we know this will be sufficient).
809 // * We could allocate a CStringBuffer with an appropriate size to
810 // have a good chance of being able to write the string into the
811 // buffer without reallocing (say, 1.5 x length).
812 if (length > numeric_limits<unsigned>::max() / 3)
813 return CString();
814 Vector<char, 1024> bufferVector(length * 3);
815
816 char* buffer = bufferVector.data();
817
818 if (is8Bit()) {
819 const LChar* characters = this->characters8();
820
821 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size());
822 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion
823 } else {
824 const UChar* characters = this->characters16();
825
826 if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) {
827 const UChar* charactersEnd = characters + length;
828 char* bufferEnd = buffer + bufferVector.size();
829 while (characters < charactersEnd) {
830 // Use strict conversion to detect unpaired surrogates.
831 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true);
832 ASSERT(result != targetExhausted);
833 // Conversion fails when there is an unpaired surrogate.
834 // Put replacement character (U+FFFD) instead of the unpaired surrogate.
835 if (result != conversionOK) {
836 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF));
837 // There should be room left, since one UChar hasn't been converted.
838 ASSERT((buffer + 3) <= bufferEnd);
839 putUTF8Triple(buffer, replacementCharacter);
840 ++characters;
841 }
842 }
843 } else {
844 bool strict = mode == StrictConversion;
845 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
846 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
847
848 // Only produced from strict conversion.
849 if (result == sourceIllegal) {
850 ASSERT(strict);
851 return CString();
852 }
853
854 // Check for an unconverted high surrogate.
855 if (result == sourceExhausted) {
856 if (strict)
857 return CString();
858 // This should be one unpaired high surrogate. Treat it the same
859 // was as an unpaired high surrogate would have been handled in
860 // the middle of a string with non-strict conversion - which is
861 // to say, simply encode it to UTF-8.
862 ASSERT((characters + 1) == (this->characters16() + length));
863 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
864 // There should be room left, since one UChar hasn't been converted.
865 ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
866 putUTF8Triple(buffer, *characters);
867 }
868 }
869 }
870
871 return CString(bufferVector.data(), buffer - bufferVector.data());
872 }
873
make8BitFrom16BitSource(const UChar * source,size_t length)874 String String::make8BitFrom16BitSource(const UChar* source, size_t length)
875 {
876 if (!length)
877 return String();
878
879 LChar* destination;
880 String result = String::createUninitialized(length, destination);
881
882 copyLCharsFromUCharSource(destination, source, length);
883
884 return result;
885 }
886
make16BitFrom8BitSource(const LChar * source,size_t length)887 String String::make16BitFrom8BitSource(const LChar* source, size_t length)
888 {
889 if (!length)
890 return String();
891
892 UChar* destination;
893 String result = String::createUninitialized(length, destination);
894
895 StringImpl::copyChars(destination, source, length);
896
897 return result;
898 }
899
fromUTF8(const LChar * stringStart,size_t length)900 String String::fromUTF8(const LChar* stringStart, size_t length)
901 {
902 RELEASE_ASSERT(length <= numeric_limits<unsigned>::max());
903
904 if (!stringStart)
905 return String();
906
907 if (!length)
908 return emptyString();
909
910 if (charactersAreAllASCII(stringStart, length))
911 return StringImpl::create(stringStart, length);
912
913 Vector<UChar, 1024> buffer(length);
914 UChar* bufferStart = buffer.data();
915
916 UChar* bufferCurrent = bufferStart;
917 const char* stringCurrent = reinterpret_cast<const char*>(stringStart);
918 if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK)
919 return String();
920
921 unsigned utf16Length = bufferCurrent - bufferStart;
922 ASSERT(utf16Length < length);
923 return StringImpl::create(bufferStart, utf16Length);
924 }
925
fromUTF8(const LChar * string)926 String String::fromUTF8(const LChar* string)
927 {
928 if (!string)
929 return String();
930 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string)));
931 }
932
fromUTF8(const CString & s)933 String String::fromUTF8(const CString& s)
934 {
935 return fromUTF8(s.data());
936 }
937
fromUTF8WithLatin1Fallback(const LChar * string,size_t size)938 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size)
939 {
940 String utf8 = fromUTF8(string, size);
941 if (!utf8)
942 return String(string, size);
943 return utf8;
944 }
945
946 // String Operations
947
isCharacterAllowedInBase(UChar c,int base)948 static bool isCharacterAllowedInBase(UChar c, int base)
949 {
950 if (c > 0x7F)
951 return false;
952 if (isASCIIDigit(c))
953 return c - '0' < base;
954 if (isASCIIAlpha(c)) {
955 if (base > 36)
956 base = 36;
957 return (c >= 'a' && c < 'a' + base - 10)
958 || (c >= 'A' && c < 'A' + base - 10);
959 }
960 return false;
961 }
962
// Parsing core of toIntegralType().
//
// Accepts: optional leading whitespace, an optional sign ('-' only for signed
// IntegralType, '+' for any type), at least one digit valid in |base|, and
// optional trailing whitespace. Anything else, or a value that does not fit in
// IntegralType, is a failure.
//
// On success the parsed value is stored in |result| and true is returned.
// On failure |result| is left untouched and false is returned.
template <typename IntegralType, typename CharType>
static inline bool tryParseIntegralType(const CharType* data, size_t length, int base, IntegralType& result)
{
    // A null buffer cannot be parsed. A non-positive base admits no digits and
    // would make the limit computations below divide by zero or overflow.
    if (!data || base <= 0)
        return false;

    const bool isSigned = std::numeric_limits<IntegralType>::is_signed;
    const IntegralType integralMax = std::numeric_limits<IntegralType>::max();
    const IntegralType integralMin = std::numeric_limits<IntegralType>::min();

    // Limits used to detect overflow *before* it happens. Positive numbers are
    // accumulated upwards against max, negative numbers downwards against min,
    // so the full range (including min itself) is representable without ever
    // overflowing or negating the accumulator.
    const IntegralType maxMultiplier = integralMax / base;
    const IntegralType maxLastDigit = integralMax % base;
    const IntegralType minMultiplier = integralMin / base;
    // Written as "0 - x" rather than "-x" so unsigned instantiations do not
    // apply unary minus to an unsigned operand.
    const IntegralType minLastDigit = 0 - (integralMin % base);

    // Skip leading whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    // Optional sign. A '-' is only meaningful for signed result types; for
    // unsigned types it falls through and is rejected as a non-digit below.
    bool isNegative = false;
    if (isSigned && length && *data == '-') {
        --length;
        ++data;
        isNegative = true;
    } else if (length && *data == '+') {
        --length;
        ++data;
    }

    // At least one digit is required.
    if (!length || !isCharacterAllowedInBase(*data, base))
        return false;

    IntegralType value = 0;
    while (length && isCharacterAllowedInBase(*data, base)) {
        const CharType c = *data;
        IntegralType digitValue;
        if (isASCIIDigit(c))
            digitValue = c - '0';
        else if (c >= 'a')
            digitValue = c - 'a' + 10;
        else
            digitValue = c - 'A' + 10;

        if (isNegative) {
            // value is in [min, 0]; refuse the digit if appending it would go below min.
            if (value < minMultiplier || (value == minMultiplier && digitValue > minLastDigit))
                return false;
            value = base * value - digitValue;
        } else {
            // value is in [0, max]; refuse the digit if appending it would exceed max.
            if (value > maxMultiplier || (value == maxMultiplier && digitValue > maxLastDigit))
                return false;
            value = base * value + digitValue;
        }

        --length;
        ++data;
    }

    // Skip trailing whitespace.
    while (length && isSpaceOrNewline(*data)) {
        --length;
        ++data;
    }

    // Any remaining character makes the whole input invalid.
    if (length)
        return false;

    result = value;
    return true;
}

// Converts |length| characters at |data| to IntegralType using |base|.
//
// data   - characters to parse; may be null, which is reported as a failure.
// length - number of characters available at |data|.
// ok     - optional; when non-null receives true on success, false otherwise.
// base   - numeric base of the digits; values <= 0 are reported as a failure.
//
// Returns the parsed value, or 0 whenever parsing fails (see
// tryParseIntegralType() above for the accepted syntax).
template <typename IntegralType, typename CharType>
static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base)
{
    IntegralType value = 0;
    const bool isOk = tryParseIntegralType<IntegralType, CharType>(data, length, base, value);
    if (ok)
        *ok = isOk;
    return isOk ? value : 0;
}
1038
// Measures the prefix of |data| that has the shape of a decimal integer:
// any run of whitespace, then at most one '+' or '-', then any run of ASCII
// digits. Returns the number of characters in that prefix (never more than
// |length|).
template <typename CharType>
static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length)
{
    size_t pos = 0;

    // Leading whitespace.
    while (pos != length && isSpaceOrNewline(data[pos]))
        ++pos;

    // Optional single sign character.
    if (pos != length) {
        const CharType candidate = data[pos];
        if (candidate == '+' || candidate == '-')
            ++pos;
    }

    // Decimal digits.
    while (pos != length && isASCIIDigit(data[pos]))
        ++pos;

    return pos;
}
1062
charactersToIntStrict(const LChar * data,size_t length,bool * ok,int base)1063 int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base)
1064 {
1065 return toIntegralType<int, LChar>(data, length, ok, base);
1066 }
1067
charactersToIntStrict(const UChar * data,size_t length,bool * ok,int base)1068 int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base)
1069 {
1070 return toIntegralType<int, UChar>(data, length, ok, base);
1071 }
1072
charactersToUIntStrict(const LChar * data,size_t length,bool * ok,int base)1073 unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base)
1074 {
1075 return toIntegralType<unsigned, LChar>(data, length, ok, base);
1076 }
1077
charactersToUIntStrict(const UChar * data,size_t length,bool * ok,int base)1078 unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base)
1079 {
1080 return toIntegralType<unsigned, UChar>(data, length, ok, base);
1081 }
1082
charactersToInt64Strict(const LChar * data,size_t length,bool * ok,int base)1083 int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1084 {
1085 return toIntegralType<int64_t, LChar>(data, length, ok, base);
1086 }
1087
charactersToInt64Strict(const UChar * data,size_t length,bool * ok,int base)1088 int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1089 {
1090 return toIntegralType<int64_t, UChar>(data, length, ok, base);
1091 }
1092
charactersToUInt64Strict(const LChar * data,size_t length,bool * ok,int base)1093 uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base)
1094 {
1095 return toIntegralType<uint64_t, LChar>(data, length, ok, base);
1096 }
1097
charactersToUInt64Strict(const UChar * data,size_t length,bool * ok,int base)1098 uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base)
1099 {
1100 return toIntegralType<uint64_t, UChar>(data, length, ok, base);
1101 }
1102
charactersToIntPtrStrict(const LChar * data,size_t length,bool * ok,int base)1103 intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base)
1104 {
1105 return toIntegralType<intptr_t, LChar>(data, length, ok, base);
1106 }
1107
charactersToIntPtrStrict(const UChar * data,size_t length,bool * ok,int base)1108 intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base)
1109 {
1110 return toIntegralType<intptr_t, UChar>(data, length, ok, base);
1111 }
1112
charactersToInt(const LChar * data,size_t length,bool * ok)1113 int charactersToInt(const LChar* data, size_t length, bool* ok)
1114 {
1115 return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1116 }
1117
charactersToInt(const UChar * data,size_t length,bool * ok)1118 int charactersToInt(const UChar* data, size_t length, bool* ok)
1119 {
1120 return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10);
1121 }
1122
charactersToUInt(const LChar * data,size_t length,bool * ok)1123 unsigned charactersToUInt(const LChar* data, size_t length, bool* ok)
1124 {
1125 return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1126 }
1127
charactersToUInt(const UChar * data,size_t length,bool * ok)1128 unsigned charactersToUInt(const UChar* data, size_t length, bool* ok)
1129 {
1130 return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1131 }
1132
charactersToInt64(const LChar * data,size_t length,bool * ok)1133 int64_t charactersToInt64(const LChar* data, size_t length, bool* ok)
1134 {
1135 return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1136 }
1137
charactersToInt64(const UChar * data,size_t length,bool * ok)1138 int64_t charactersToInt64(const UChar* data, size_t length, bool* ok)
1139 {
1140 return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1141 }
1142
charactersToUInt64(const LChar * data,size_t length,bool * ok)1143 uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok)
1144 {
1145 return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1146 }
1147
charactersToUInt64(const UChar * data,size_t length,bool * ok)1148 uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok)
1149 {
1150 return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1151 }
1152
charactersToIntPtr(const LChar * data,size_t length,bool * ok)1153 intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok)
1154 {
1155 return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10);
1156 }
1157
charactersToIntPtr(const UChar * data,size_t length,bool * ok)1158 intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok)
1159 {
1160 return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10);
1161 }
1162
// Selects how toDoubleType() reports success when characters remain after the
// parsed number.
enum TrailingJunkPolicy {
    DisallowTrailingJunk, // success only if the entire input was consumed
    AllowTrailingJunk     // success as soon as a number was parsed
};
1164
1165 template <typename CharType, TrailingJunkPolicy policy>
toDoubleType(const CharType * data,size_t length,bool * ok,size_t & parsedLength)1166 static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength)
1167 {
1168 size_t leadingSpacesLength = 0;
1169 while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength]))
1170 ++leadingSpacesLength;
1171
1172 double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength);
1173 if (!parsedLength) {
1174 if (ok)
1175 *ok = false;
1176 return 0.0;
1177 }
1178
1179 parsedLength += leadingSpacesLength;
1180 if (ok)
1181 *ok = policy == AllowTrailingJunk || parsedLength == length;
1182 return number;
1183 }
1184
charactersToDouble(const LChar * data,size_t length,bool * ok)1185 double charactersToDouble(const LChar* data, size_t length, bool* ok)
1186 {
1187 size_t parsedLength;
1188 return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1189 }
1190
charactersToDouble(const UChar * data,size_t length,bool * ok)1191 double charactersToDouble(const UChar* data, size_t length, bool* ok)
1192 {
1193 size_t parsedLength;
1194 return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength);
1195 }
1196
charactersToFloat(const LChar * data,size_t length,bool * ok)1197 float charactersToFloat(const LChar* data, size_t length, bool* ok)
1198 {
1199 // FIXME: This will return ok even when the string fits into a double but not a float.
1200 size_t parsedLength;
1201 return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1202 }
1203
charactersToFloat(const UChar * data,size_t length,bool * ok)1204 float charactersToFloat(const UChar* data, size_t length, bool* ok)
1205 {
1206 // FIXME: This will return ok even when the string fits into a double but not a float.
1207 size_t parsedLength;
1208 return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength));
1209 }
1210
charactersToFloat(const LChar * data,size_t length,size_t & parsedLength)1211 float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength)
1212 {
1213 // FIXME: This will return ok even when the string fits into a double but not a float.
1214 return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1215 }
1216
charactersToFloat(const UChar * data,size_t length,size_t & parsedLength)1217 float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength)
1218 {
1219 // FIXME: This will return ok even when the string fits into a double but not a float.
1220 return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength));
1221 }
1222
emptyString()1223 const String& emptyString()
1224 {
1225 DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty()));
1226 return emptyString;
1227 }
1228
1229 } // namespace WTF
1230
1231 #ifndef NDEBUG
1232 // For use in the debugger
1233 String* string(const char*);
1234 Vector<char> asciiDebug(StringImpl* impl);
1235 Vector<char> asciiDebug(String& string);
1236
show() const1237 void String::show() const
1238 {
1239 dataLogF("%s\n", asciiDebug(impl()).data());
1240 }
1241
string(const char * s)1242 String* string(const char* s)
1243 {
1244 // leaks memory!
1245 return new String(s);
1246 }
1247
asciiDebug(StringImpl * impl)1248 Vector<char> asciiDebug(StringImpl* impl)
1249 {
1250 if (!impl)
1251 return asciiDebug(String("[null]").impl());
1252
1253 Vector<char> buffer;
1254 for (unsigned i = 0; i < impl->length(); ++i) {
1255 UChar ch = (*impl)[i];
1256 if (isASCIIPrintable(ch)) {
1257 if (ch == '\\')
1258 buffer.append(ch);
1259 buffer.append(ch);
1260 } else {
1261 buffer.append('\\');
1262 buffer.append('u');
1263 appendUnsignedAsHexFixedSize(ch, buffer, 4);
1264 }
1265 }
1266 buffer.append('\0');
1267 return buffer;
1268 }
1269
asciiDebug(String & string)1270 Vector<char> asciiDebug(String& string)
1271 {
1272 return asciiDebug(string.impl());
1273 }
1274
1275 #endif
1276