• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/inspector/string-16.h"
6 
7 #include <algorithm>
8 #include <cctype>
9 #include <cstdlib>
10 #include <cstring>
11 #include <limits>
12 #include <string>
13 
14 #include "src/base/platform/platform.h"
15 #include "src/conversions.h"
16 
17 namespace v8_inspector {
18 
19 namespace {
20 
isASCII(UChar c)21 bool isASCII(UChar c) { return !(c & ~0x7F); }
22 
isSpaceOrNewLine(UChar c)23 bool isSpaceOrNewLine(UChar c) {
24   return isASCII(c) && c <= ' ' && (c == ' ' || (c <= 0xD && c >= 0x9));
25 }
26 
charactersToInteger(const UChar * characters,size_t length,bool * ok=nullptr)27 int charactersToInteger(const UChar* characters, size_t length,
28                         bool* ok = nullptr) {
29   std::vector<char> buffer;
30   buffer.reserve(length + 1);
31   for (size_t i = 0; i < length; ++i) {
32     if (!isASCII(characters[i])) {
33       if (ok) *ok = false;
34       return 0;
35     }
36     buffer.push_back(static_cast<char>(characters[i]));
37   }
38   buffer.push_back('\0');
39 
40   char* endptr;
41   int64_t result =
42       static_cast<int64_t>(std::strtol(buffer.data(), &endptr, 10));
43   if (ok) {
44     *ok = !(*endptr) && result <= std::numeric_limits<int>::max() &&
45           result >= std::numeric_limits<int>::min();
46   }
47   return static_cast<int>(result);
48 }
49 
50 const UChar replacementCharacter = 0xFFFD;
51 using UChar32 = uint32_t;
52 
// Returns the total length (2, 3 or 4) of the UTF-8 sequence introduced by the
// non-ASCII lead byte |b0|, or 0 when |b0| is not a valid lead byte for a
// sequence of that size (e.g. a continuation byte or a 5/6-byte lead).
inline int inlineUTF8SequenceLengthNonASCII(char b0) {
  const unsigned lead = static_cast<unsigned char>(b0);
  if ((lead >> 5) == 0x06) return 2;  // 110xxxxx
  if ((lead >> 4) == 0x0E) return 3;  // 1110xxxx
  if ((lead >> 3) == 0x1E) return 4;  // 11110xxx
  return 0;
}
60 
inlineUTF8SequenceLength(char b0)61 inline int inlineUTF8SequenceLength(char b0) {
62   return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
63 }
64 
// Marker bits OR-ed into the lead byte of an encoded UTF-8 sequence, indexed
// by the total number of bytes in the sequence. There is one entry per UTF-8
// sequence type (one byte, two bytes, ...); *legal* UTF-8 only ever uses
// sequences of 4 or fewer bytes.
static const unsigned char firstByteMark[7] = {
    0x00,  // index 0: no sequence has zero bytes
    0x00,  // 1 byte:  0xxxxxxx
    0xC0,  // 2 bytes: 110xxxxx
    0xE0,  // 3 bytes: 1110xxxx
    0xF0,  // 4 bytes: 11110xxx
    0xF8,  // 5 bytes: 111110xx
    0xFC   // 6 bytes: 1111110x
};
72 
// Outcome reported by the UTF-8 <-> UTF-16 conversion routines.
enum ConversionResult {
  conversionOK,     // conversion successful
  sourceExhausted,  // partial character in source, but hit end
  targetExhausted,  // insuff. room in target for conversion
  sourceIllegal     // source sequence is illegal/malformed
};
79 
convertUTF16ToUTF8(const UChar ** sourceStart,const UChar * sourceEnd,char ** targetStart,char * targetEnd,bool strict)80 ConversionResult convertUTF16ToUTF8(const UChar** sourceStart,
81                                     const UChar* sourceEnd, char** targetStart,
82                                     char* targetEnd, bool strict) {
83   ConversionResult result = conversionOK;
84   const UChar* source = *sourceStart;
85   char* target = *targetStart;
86   while (source < sourceEnd) {
87     UChar32 ch;
88     uint32_t bytesToWrite = 0;
89     const UChar32 byteMask = 0xBF;
90     const UChar32 byteMark = 0x80;
91     const UChar* oldSource =
92         source;  // In case we have to back up because of target overflow.
93     ch = static_cast<uint16_t>(*source++);
94     // If we have a surrogate pair, convert to UChar32 first.
95     if (ch >= 0xD800 && ch <= 0xDBFF) {
96       // If the 16 bits following the high surrogate are in the source buffer...
97       if (source < sourceEnd) {
98         UChar32 ch2 = static_cast<uint16_t>(*source);
99         // If it's a low surrogate, convert to UChar32.
100         if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
101           ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
102           ++source;
103         } else if (strict) {  // it's an unpaired high surrogate
104           --source;           // return to the illegal value itself
105           result = sourceIllegal;
106           break;
107         }
108       } else {     // We don't have the 16 bits following the high surrogate.
109         --source;  // return to the high surrogate
110         result = sourceExhausted;
111         break;
112       }
113     } else if (strict) {
114       // UTF-16 surrogate values are illegal in UTF-32
115       if (ch >= 0xDC00 && ch <= 0xDFFF) {
116         --source;  // return to the illegal value itself
117         result = sourceIllegal;
118         break;
119       }
120     }
121     // Figure out how many bytes the result will require
122     if (ch < (UChar32)0x80) {
123       bytesToWrite = 1;
124     } else if (ch < (UChar32)0x800) {
125       bytesToWrite = 2;
126     } else if (ch < (UChar32)0x10000) {
127       bytesToWrite = 3;
128     } else if (ch < (UChar32)0x110000) {
129       bytesToWrite = 4;
130     } else {
131       bytesToWrite = 3;
132       ch = replacementCharacter;
133     }
134 
135     target += bytesToWrite;
136     if (target > targetEnd) {
137       source = oldSource;  // Back up source pointer!
138       target -= bytesToWrite;
139       result = targetExhausted;
140       break;
141     }
142     switch (bytesToWrite) {  // note: everything falls through.
143       case 4:
144         *--target = static_cast<char>((ch | byteMark) & byteMask);
145         ch >>= 6;
146       case 3:
147         *--target = static_cast<char>((ch | byteMark) & byteMask);
148         ch >>= 6;
149       case 2:
150         *--target = static_cast<char>((ch | byteMark) & byteMask);
151         ch >>= 6;
152       case 1:
153         *--target = static_cast<char>(ch | firstByteMark[bytesToWrite]);
154     }
155     target += bytesToWrite;
156   }
157   *sourceStart = source;
158   *targetStart = target;
159   return result;
160 }
161 
/**
 * True when the code point lies in the Basic Multilingual Plane
 * (U+0000..U+ffff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_BMP(c) (static_cast<uint32_t>(c) <= 0xffff)

/**
 * True when the code point is supplementary (U+10000..U+10ffff), i.e. needs a
 * surrogate pair in UTF-16.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.8
 */
#define U_IS_SUPPLEMENTARY(c) (static_cast<uint32_t>((c)-0x10000) <= 0xfffff)

/**
 * True when the code point is a surrogate (U+d800..U+dfff).
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U_IS_SURROGATE(c) (((c)&0xfffff800) == 0xd800)

/**
 * Computes the lead surrogate (0xd800..0xdbff) of a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) \
  static_cast<UChar>(((supplementary) >> 10) + 0xd7c0)

/**
 * Computes the trail surrogate (0xdc00..0xdfff) of a
 * supplementary code point (0x10000..0x10ffff).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) \
  static_cast<UChar>(((supplementary)&0x3ff) | 0xdc00)
203 
// Checks that the |length| bytes at |source| form one well-formed UTF-8
// sequence.
//
// This must be called with the length pre-determined by the first byte.
// If presented with a length > 4 (or < 1), this returns false.  The Unicode
// definition of UTF-8 goes up to 4-byte sequences.
//
// Rejected are: continuation bytes outside 0x80..0xBF, overlong encodings
// (lead 0xC0/0xC1, 0xE0 followed by < 0xA0, 0xF0 followed by < 0x90),
// encoded surrogates (0xED followed by > 0x9F) and values above U+10FFFF
// (0xF4 followed by > 0x8F, or any lead byte > 0xF4).
static bool isLegalUTF8(const unsigned char* source, int length) {
  unsigned char a;
  const unsigned char* srcptr = source + length;
  switch (length) {
    default:
      return false;
    // Everything else falls through when "true"...
    case 4:
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
      // Fall through.
    case 3:
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
      // Fall through.
    case 2:
      // The second byte must be a continuation byte regardless of the lead
      // byte. Checking only the upper bound here would let e.g. "ED 20 80"
      // or "F4 20 80 80" through, because the 0xED / 0xF4 cases below only
      // tighten the upper bound.
      if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;

      // Lead-byte specific tightening of the second byte's range.
      // No fall-through in this inner switch.
      switch (*source) {
        case 0xE0:
          if (a < 0xA0) return false;
          break;
        case 0xED:
          if (a > 0x9F) return false;
          break;
        case 0xF0:
          if (a < 0x90) return false;
          break;
        case 0xF4:
          if (a > 0x8F) return false;
          break;
        default:
          break;
      }
      // Fall through.
    case 1:
      // A lone continuation byte or an overlong 2-byte lead is never legal.
      if (*source >= 0x80 && *source < 0xC2) return false;
  }
  if (*source > 0xF4) return false;
  return true;
}
245 
246 // Magic values subtracted from a buffer value during UTF8 conversion.
247 // This table contains as many values as there might be trailing bytes
248 // in a UTF-8 sequence.
249 static const UChar32 offsetsFromUTF8[6] = {0x00000000UL,
250                                            0x00003080UL,
251                                            0x000E2080UL,
252                                            0x03C82080UL,
253                                            static_cast<UChar32>(0xFA082080UL),
254                                            static_cast<UChar32>(0x82082080UL)};
255 
readUTF8Sequence(const char * & sequence,size_t length)256 static inline UChar32 readUTF8Sequence(const char*& sequence, size_t length) {
257   UChar32 character = 0;
258 
259   // The cases all fall through.
260   switch (length) {
261     case 6:
262       character += static_cast<unsigned char>(*sequence++);
263       character <<= 6;
264     case 5:
265       character += static_cast<unsigned char>(*sequence++);
266       character <<= 6;
267     case 4:
268       character += static_cast<unsigned char>(*sequence++);
269       character <<= 6;
270     case 3:
271       character += static_cast<unsigned char>(*sequence++);
272       character <<= 6;
273     case 2:
274       character += static_cast<unsigned char>(*sequence++);
275       character <<= 6;
276     case 1:
277       character += static_cast<unsigned char>(*sequence++);
278   }
279 
280   return character - offsetsFromUTF8[length - 1];
281 }
282 
convertUTF8ToUTF16(const char ** sourceStart,const char * sourceEnd,UChar ** targetStart,UChar * targetEnd,bool * sourceAllASCII,bool strict)283 ConversionResult convertUTF8ToUTF16(const char** sourceStart,
284                                     const char* sourceEnd, UChar** targetStart,
285                                     UChar* targetEnd, bool* sourceAllASCII,
286                                     bool strict) {
287   ConversionResult result = conversionOK;
288   const char* source = *sourceStart;
289   UChar* target = *targetStart;
290   UChar orAllData = 0;
291   while (source < sourceEnd) {
292     int utf8SequenceLength = inlineUTF8SequenceLength(*source);
293     if (sourceEnd - source < utf8SequenceLength) {
294       result = sourceExhausted;
295       break;
296     }
297     // Do this check whether lenient or strict
298     if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source),
299                      utf8SequenceLength)) {
300       result = sourceIllegal;
301       break;
302     }
303 
304     UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
305 
306     if (target >= targetEnd) {
307       source -= utf8SequenceLength;  // Back up source pointer!
308       result = targetExhausted;
309       break;
310     }
311 
312     if (U_IS_BMP(character)) {
313       // UTF-16 surrogate values are illegal in UTF-32
314       if (U_IS_SURROGATE(character)) {
315         if (strict) {
316           source -= utf8SequenceLength;  // return to the illegal value itself
317           result = sourceIllegal;
318           break;
319         }
320         *target++ = replacementCharacter;
321         orAllData |= replacementCharacter;
322       } else {
323         *target++ = static_cast<UChar>(character);  // normal case
324         orAllData |= character;
325       }
326     } else if (U_IS_SUPPLEMENTARY(character)) {
327       // target is a character in range 0xFFFF - 0x10FFFF
328       if (target + 1 >= targetEnd) {
329         source -= utf8SequenceLength;  // Back up source pointer!
330         result = targetExhausted;
331         break;
332       }
333       *target++ = U16_LEAD(character);
334       *target++ = U16_TRAIL(character);
335       orAllData = 0xffff;
336     } else {
337       if (strict) {
338         source -= utf8SequenceLength;  // return to the start
339         result = sourceIllegal;
340         break;  // Bail out; shouldn't continue
341       } else {
342         *target++ = replacementCharacter;
343         orAllData |= replacementCharacter;
344       }
345     }
346   }
347   *sourceStart = source;
348   *targetStart = target;
349 
350   if (sourceAllASCII) *sourceAllASCII = !(orAllData & ~0x7f);
351 
352   return result;
353 }
354 
355 // Helper to write a three-byte UTF-8 code point to the buffer, caller must
356 // check room is available.
putUTF8Triple(char * & buffer,UChar ch)357 static inline void putUTF8Triple(char*& buffer, UChar ch) {
358   *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0);
359   *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80);
360   *buffer++ = static_cast<char>((ch & 0x3F) | 0x80);
361 }
362 
363 }  // namespace
364 
365 // static
fromInteger(int number)366 String16 String16::fromInteger(int number) {
367   char arr[50];
368   v8::internal::Vector<char> buffer(arr, arraysize(arr));
369   return String16(IntToCString(number, buffer));
370 }
371 
372 // static
fromInteger(size_t number)373 String16 String16::fromInteger(size_t number) {
374   const size_t kBufferSize = 50;
375   char buffer[kBufferSize];
376 #if !defined(_WIN32) && !defined(_WIN64)
377   v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
378 #else
379   v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
380 #endif
381   return String16(buffer);
382 }
383 
384 // static
fromDouble(double number)385 String16 String16::fromDouble(double number) {
386   char arr[50];
387   v8::internal::Vector<char> buffer(arr, arraysize(arr));
388   return String16(DoubleToCString(number, buffer));
389 }
390 
391 // static
fromDouble(double number,int precision)392 String16 String16::fromDouble(double number, int precision) {
393   std::unique_ptr<char[]> str(
394       v8::internal::DoubleToPrecisionCString(number, precision));
395   return String16(str.get());
396 }
397 
toInteger(bool * ok) const398 int String16::toInteger(bool* ok) const {
399   return charactersToInteger(characters16(), length(), ok);
400 }
401 
stripWhiteSpace() const402 String16 String16::stripWhiteSpace() const {
403   if (!length()) return String16();
404 
405   size_t start = 0;
406   size_t end = length() - 1;
407 
408   // skip white space from start
409   while (start <= end && isSpaceOrNewLine(characters16()[start])) ++start;
410 
411   // only white space
412   if (start > end) return String16();
413 
414   // skip white space from end
415   while (end && isSpaceOrNewLine(characters16()[end])) --end;
416 
417   if (!start && end == length() - 1) return *this;
418   return String16(characters16() + start, end + 1 - start);
419 }
420 
String16Builder()421 String16Builder::String16Builder() {}
422 
append(const String16 & s)423 void String16Builder::append(const String16& s) {
424   m_buffer.insert(m_buffer.end(), s.characters16(),
425                   s.characters16() + s.length());
426 }
427 
append(UChar c)428 void String16Builder::append(UChar c) { m_buffer.push_back(c); }
429 
append(char c)430 void String16Builder::append(char c) {
431   UChar u = c;
432   m_buffer.push_back(u);
433 }
434 
append(const UChar * characters,size_t length)435 void String16Builder::append(const UChar* characters, size_t length) {
436   m_buffer.insert(m_buffer.end(), characters, characters + length);
437 }
438 
append(const char * characters,size_t length)439 void String16Builder::append(const char* characters, size_t length) {
440   m_buffer.insert(m_buffer.end(), characters, characters + length);
441 }
442 
appendNumber(int number)443 void String16Builder::appendNumber(int number) {
444   const int kBufferSize = 11;
445   char buffer[kBufferSize];
446   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%d", number);
447   DCHECK_GT(kBufferSize, chars);
448   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
449 }
450 
appendNumber(size_t number)451 void String16Builder::appendNumber(size_t number) {
452   const int kBufferSize = 20;
453   char buffer[kBufferSize];
454 #if !defined(_WIN32) && !defined(_WIN64)
455   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%zu", number);
456 #else
457   int chars = v8::base::OS::SNPrintF(buffer, kBufferSize, "%Iu", number);
458 #endif
459   DCHECK_GT(kBufferSize, chars);
460   m_buffer.insert(m_buffer.end(), buffer, buffer + chars);
461 }
462 
toString()463 String16 String16Builder::toString() {
464   return String16(m_buffer.data(), m_buffer.size());
465 }
466 
reserveCapacity(size_t capacity)467 void String16Builder::reserveCapacity(size_t capacity) {
468   m_buffer.reserve(capacity);
469 }
470 
fromUTF8(const char * stringStart,size_t length)471 String16 String16::fromUTF8(const char* stringStart, size_t length) {
472   if (!stringStart || !length) return String16();
473 
474   std::vector<UChar> buffer(length);
475   UChar* bufferStart = buffer.data();
476 
477   UChar* bufferCurrent = bufferStart;
478   const char* stringCurrent = stringStart;
479   if (convertUTF8ToUTF16(&stringCurrent, stringStart + length, &bufferCurrent,
480                          bufferCurrent + buffer.size(), 0,
481                          true) != conversionOK)
482     return String16();
483 
484   size_t utf16Length = bufferCurrent - bufferStart;
485   return String16(bufferStart, utf16Length);
486 }
487 
utf8() const488 std::string String16::utf8() const {
489   size_t length = this->length();
490 
491   if (!length) return std::string("");
492 
493   // Allocate a buffer big enough to hold all the characters
494   // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes).
495   // Optimization ideas, if we find this function is hot:
496   //  * We could speculatively create a CStringBuffer to contain 'length'
497   //    characters, and resize if necessary (i.e. if the buffer contains
498   //    non-ascii characters). (Alternatively, scan the buffer first for
499   //    ascii characters, so we know this will be sufficient).
500   //  * We could allocate a CStringBuffer with an appropriate size to
501   //    have a good chance of being able to write the string into the
502   //    buffer without reallocing (say, 1.5 x length).
503   if (length > std::numeric_limits<unsigned>::max() / 3) return std::string();
504   std::vector<char> bufferVector(length * 3);
505   char* buffer = bufferVector.data();
506   const UChar* characters = m_impl.data();
507 
508   ConversionResult result =
509       convertUTF16ToUTF8(&characters, characters + length, &buffer,
510                          buffer + bufferVector.size(), false);
511   DCHECK(
512       result !=
513       targetExhausted);  // (length * 3) should be sufficient for any conversion
514 
515   // Only produced from strict conversion.
516   DCHECK(result != sourceIllegal);
517 
518   // Check for an unconverted high surrogate.
519   if (result == sourceExhausted) {
520     // This should be one unpaired high surrogate. Treat it the same
521     // was as an unpaired high surrogate would have been handled in
522     // the middle of a string with non-strict conversion - which is
523     // to say, simply encode it to UTF-8.
524     DCHECK((characters + 1) == (m_impl.data() + length));
525     DCHECK((*characters >= 0xD800) && (*characters <= 0xDBFF));
526     // There should be room left, since one UChar hasn't been
527     // converted.
528     DCHECK((buffer + 3) <= (buffer + bufferVector.size()));
529     putUTF8Triple(buffer, *characters);
530   }
531 
532   return std::string(bufferVector.data(), buffer - bufferVector.data());
533 }
534 
535 }  // namespace v8_inspector
536