1 /*
2 * Copyright (C) 1999-2000 Harri Porten (porten@kde.org)
3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich@uwaterloo.ca)
5 * Copyright (C) 2009 Google Inc. All rights reserved.
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Library General Public
9 * License as published by the Free Software Foundation; either
10 * version 2 of the License, or (at your option) any later version.
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Library General Public License for more details.
16 *
17 * You should have received a copy of the GNU Library General Public License
18 * along with this library; see the file COPYING.LIB. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 * Boston, MA 02110-1301, USA.
21 *
22 */
23
24 #include "config.h"
25 #include "UString.h"
26
27 #include "JSGlobalObjectFunctions.h"
28 #include "Heap.h"
29 #include "Identifier.h"
30 #include "Operations.h"
31 #include <ctype.h>
32 #include <limits.h>
33 #include <limits>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <wtf/ASCIICType.h>
37 #include <wtf/Assertions.h>
38 #include <wtf/DecimalNumber.h>
39 #include <wtf/MathExtras.h>
40 #include <wtf/StringExtras.h>
41 #include <wtf/Vector.h>
42 #include <wtf/unicode/UTF8.h>
43
44 #if HAVE(STRINGS_H)
45 #include <strings.h>
46 #endif
47
48 using namespace WTF;
49 using namespace WTF::Unicode;
50 using namespace std;
51
52 namespace JSC {
53
54 extern const double NaN;
55 extern const double Inf;
56
57 COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small);
58
59 // Construct a string with UTF-16 data.
UString(const UChar * characters,unsigned length)60 UString::UString(const UChar* characters, unsigned length)
61 : m_impl(characters ? StringImpl::create(characters, length) : 0)
62 {
63 }
64
65 // Construct a string with UTF-16 data, from a null-terminated source.
UString(const UChar * characters)66 UString::UString(const UChar* characters)
67 {
68 if (!characters)
69 return;
70
71 int length = 0;
72 while (characters[length] != UChar(0))
73 ++length;
74
75 m_impl = StringImpl::create(characters, length);
76 }
77
78 // Construct a string with latin1 data.
UString(const char * characters,unsigned length)79 UString::UString(const char* characters, unsigned length)
80 : m_impl(characters ? StringImpl::create(characters, length) : 0)
81 {
82 }
83
84 // Construct a string with latin1 data, from a null-terminated source.
UString(const char * characters)85 UString::UString(const char* characters)
86 : m_impl(characters ? StringImpl::create(characters) : 0)
87 {
88 }
89
number(int i)90 UString UString::number(int i)
91 {
92 UChar buf[1 + sizeof(i) * 3];
93 UChar* end = buf + WTF_ARRAY_LENGTH(buf);
94 UChar* p = end;
95
96 if (i == 0)
97 *--p = '0';
98 else if (i == INT_MIN) {
99 char minBuf[1 + sizeof(i) * 3];
100 snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN);
101 return UString(minBuf);
102 } else {
103 bool negative = false;
104 if (i < 0) {
105 negative = true;
106 i = -i;
107 }
108 while (i) {
109 *--p = static_cast<unsigned short>((i % 10) + '0');
110 i /= 10;
111 }
112 if (negative)
113 *--p = '-';
114 }
115
116 return UString(p, static_cast<unsigned>(end - p));
117 }
118
number(long long i)119 UString UString::number(long long i)
120 {
121 UChar buf[1 + sizeof(i) * 3];
122 UChar* end = buf + WTF_ARRAY_LENGTH(buf);
123 UChar* p = end;
124
125 if (i == 0)
126 *--p = '0';
127 else if (i == std::numeric_limits<long long>::min()) {
128 char minBuf[1 + sizeof(i) * 3];
129 #if OS(WINDOWS)
130 snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min());
131 #else
132 snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min());
133 #endif
134 return UString(minBuf);
135 } else {
136 bool negative = false;
137 if (i < 0) {
138 negative = true;
139 i = -i;
140 }
141 while (i) {
142 *--p = static_cast<unsigned short>((i % 10) + '0');
143 i /= 10;
144 }
145 if (negative)
146 *--p = '-';
147 }
148
149 return UString(p, static_cast<unsigned>(end - p));
150 }
151
number(unsigned u)152 UString UString::number(unsigned u)
153 {
154 UChar buf[sizeof(u) * 3];
155 UChar* end = buf + WTF_ARRAY_LENGTH(buf);
156 UChar* p = end;
157
158 if (u == 0)
159 *--p = '0';
160 else {
161 while (u) {
162 *--p = static_cast<unsigned short>((u % 10) + '0');
163 u /= 10;
164 }
165 }
166
167 return UString(p, static_cast<unsigned>(end - p));
168 }
169
number(long l)170 UString UString::number(long l)
171 {
172 UChar buf[1 + sizeof(l) * 3];
173 UChar* end = buf + WTF_ARRAY_LENGTH(buf);
174 UChar* p = end;
175
176 if (l == 0)
177 *--p = '0';
178 else if (l == LONG_MIN) {
179 char minBuf[1 + sizeof(l) * 3];
180 snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN);
181 return UString(minBuf);
182 } else {
183 bool negative = false;
184 if (l < 0) {
185 negative = true;
186 l = -l;
187 }
188 while (l) {
189 *--p = static_cast<unsigned short>((l % 10) + '0');
190 l /= 10;
191 }
192 if (negative)
193 *--p = '-';
194 }
195
196 return UString(p, end - p);
197 }
198
number(double d)199 UString UString::number(double d)
200 {
201 NumberToStringBuffer buffer;
202 unsigned length = numberToString(d, buffer);
203 return UString(buffer, length);
204 }
205
substringSharingImpl(unsigned offset,unsigned length) const206 UString UString::substringSharingImpl(unsigned offset, unsigned length) const
207 {
208 // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar).
209
210 unsigned stringLength = this->length();
211 offset = min(offset, stringLength);
212 length = min(length, stringLength - offset);
213
214 if (!offset && length == stringLength)
215 return *this;
216 return UString(StringImpl::create(m_impl, offset, length));
217 }
218
operator ==(const UString & s1,const char * s2)219 bool operator==(const UString& s1, const char *s2)
220 {
221 if (s2 == 0)
222 return s1.isEmpty();
223
224 const UChar* u = s1.characters();
225 const UChar* uend = u + s1.length();
226 while (u != uend && *s2) {
227 if (u[0] != (unsigned char)*s2)
228 return false;
229 s2++;
230 u++;
231 }
232
233 return u == uend && *s2 == 0;
234 }
235
operator <(const UString & s1,const UString & s2)236 bool operator<(const UString& s1, const UString& s2)
237 {
238 const unsigned l1 = s1.length();
239 const unsigned l2 = s2.length();
240 const unsigned lmin = l1 < l2 ? l1 : l2;
241 const UChar* c1 = s1.characters();
242 const UChar* c2 = s2.characters();
243 unsigned l = 0;
244 while (l < lmin && *c1 == *c2) {
245 c1++;
246 c2++;
247 l++;
248 }
249 if (l < lmin)
250 return (c1[0] < c2[0]);
251
252 return (l1 < l2);
253 }
254
operator >(const UString & s1,const UString & s2)255 bool operator>(const UString& s1, const UString& s2)
256 {
257 const unsigned l1 = s1.length();
258 const unsigned l2 = s2.length();
259 const unsigned lmin = l1 < l2 ? l1 : l2;
260 const UChar* c1 = s1.characters();
261 const UChar* c2 = s2.characters();
262 unsigned l = 0;
263 while (l < lmin && *c1 == *c2) {
264 c1++;
265 c2++;
266 l++;
267 }
268 if (l < lmin)
269 return (c1[0] > c2[0]);
270
271 return (l1 > l2);
272 }
273
ascii() const274 CString UString::ascii() const
275 {
276 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
277 // preserved, characters outside of this range are converted to '?'.
278
279 unsigned length = this->length();
280 const UChar* characters = this->characters();
281
282 char* characterBuffer;
283 CString result = CString::newUninitialized(length, characterBuffer);
284
285 for (unsigned i = 0; i < length; ++i) {
286 UChar ch = characters[i];
287 characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch;
288 }
289
290 return result;
291 }
292
latin1() const293 CString UString::latin1() const
294 {
295 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are
296 // preserved, characters outside of this range are converted to '?'.
297
298 unsigned length = this->length();
299 const UChar* characters = this->characters();
300
301 char* characterBuffer;
302 CString result = CString::newUninitialized(length, characterBuffer);
303
304 for (unsigned i = 0; i < length; ++i) {
305 UChar ch = characters[i];
306 characterBuffer[i] = ch > 0xff ? '?' : ch;
307 }
308
309 return result;
310 }
311
312 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available.
putUTF8Triple(char * & buffer,UChar ch)313 static inline void putUTF8Triple(char*& buffer, UChar ch)
314 {
315 ASSERT(ch >= 0x0800);
316 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
317 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
318 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
319 }
320
utf8(bool strict) const321 CString UString::utf8(bool strict) const
322 {
323 unsigned length = this->length();
324 const UChar* characters = this->characters();
325
326 // Allocate a buffer big enough to hold all the characters
327 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
328 // Optimization ideas, if we find this function is hot:
329 // * We could speculatively create a CStringBuffer to contain 'length'
330 // characters, and resize if necessary (i.e. if the buffer contains
331 // non-ascii characters). (Alternatively, scan the buffer first for
332 // ascii characters, so we know this will be sufficient).
333 // * We could allocate a CStringBuffer with an appropriate size to
334 // have a good chance of being able to write the string into the
335 // buffer without reallocing (say, 1.5 x length).
336 if (length > numeric_limits<unsigned>::max() / 3)
337 return CString();
338 Vector<char, 1024> bufferVector(length * 3);
339
340 char* buffer = bufferVector.data();
341 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict);
342 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion
343
344 // Only produced from strict conversion.
345 if (result == sourceIllegal)
346 return CString();
347
348 // Check for an unconverted high surrogate.
349 if (result == sourceExhausted) {
350 if (strict)
351 return CString();
352 // This should be one unpaired high surrogate. Treat it the same
353 // was as an unpaired high surrogate would have been handled in
354 // the middle of a string with non-strict conversion - which is
355 // to say, simply encode it to UTF-8.
356 ASSERT((characters + 1) == (this->characters() + length));
357 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF));
358 // There should be room left, since one UChar hasn't been converted.
359 ASSERT((buffer + 3) <= (buffer + bufferVector.size()));
360 putUTF8Triple(buffer, *characters);
361 }
362
363 return CString(bufferVector.data(), buffer - bufferVector.data());
364 }
365
366 } // namespace JSC
367