1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34 #include <google/protobuf/stubs/mathlimits.h>
35
36 #include <errno.h>
37 #include <float.h> // FLT_DIG and DBL_DIG
38 #include <limits>
39 #include <limits.h>
40 #include <stdio.h>
41 #include <iterator>
42
43 #include <google/protobuf/stubs/stl_util.h>
44
45 #ifdef _WIN32
46 // MSVC has only _snprintf, not snprintf.
47 //
48 // MinGW has both snprintf and _snprintf, but they appear to be different
49 // functions. The former is buggy. When invoked like so:
50 // char buffer[32];
51 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
52 // it prints "1.23000e+10". This is plainly wrong: %g should never print
53 // trailing zeros after the decimal point. For some reason this bug only
54 // occurs with some input values, not all. In any case, _snprintf does the
55 // right thing, so we use it.
56 #define snprintf _snprintf
57 #endif
58
59 namespace google {
60 namespace protobuf {
61
62 // These are defined as macros on some platforms. #undef them so that we can
63 // redefine them.
64 #undef isxdigit
65 #undef isprint
66
67 // The definitions of these in ctype.h change based on locale. Since our
68 // string manipulation is all in relation to the protocol buffer and C++
69 // languages, we always want to use the C locale. So, we re-define these
70 // exactly as we want them.
isxdigit(char c)71 inline bool isxdigit(char c) {
72 return ('0' <= c && c <= '9') ||
73 ('a' <= c && c <= 'f') ||
74 ('A' <= c && c <= 'F');
75 }
76
isprint(char c)77 inline bool isprint(char c) {
78 return c >= 0x20 && c <= 0x7E;
79 }
80
81 // ----------------------------------------------------------------------
82 // StripString
83 // Replaces any occurrence of the character 'remove' (or the characters
84 // in 'remove') with the character 'replacewith'.
85 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)86 void StripString(string* s, const char* remove, char replacewith) {
87 const char * str_start = s->c_str();
88 const char * str = str_start;
89 for (str = strpbrk(str, remove);
90 str != NULL;
91 str = strpbrk(str + 1, remove)) {
92 (*s)[str - str_start] = replacewith;
93 }
94 }
95
StripWhitespace(string * str)96 void StripWhitespace(string* str) {
97 int str_length = str->length();
98
99 // Strip off leading whitespace.
100 int first = 0;
101 while (first < str_length && ascii_isspace(str->at(first))) {
102 ++first;
103 }
104 // If entire string is white space.
105 if (first == str_length) {
106 str->clear();
107 return;
108 }
109 if (first > 0) {
110 str->erase(0, first);
111 str_length -= first;
112 }
113
114 // Strip off trailing whitespace.
115 int last = str_length - 1;
116 while (last >= 0 && ascii_isspace(str->at(last))) {
117 --last;
118 }
119 if (last != (str_length - 1) && last >= 0) {
120 str->erase(last + 1, string::npos);
121 }
122 }
123
124 // ----------------------------------------------------------------------
125 // StringReplace()
126 // Replace the "old" pattern with the "new" pattern in a string,
127 // and append the result to "res". If replace_all is false,
128 // it only replaces the first instance of "old."
129 // ----------------------------------------------------------------------
130
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)131 void StringReplace(const string& s, const string& oldsub,
132 const string& newsub, bool replace_all,
133 string* res) {
134 if (oldsub.empty()) {
135 res->append(s); // if empty, append the given string.
136 return;
137 }
138
139 string::size_type start_pos = 0;
140 string::size_type pos;
141 do {
142 pos = s.find(oldsub, start_pos);
143 if (pos == string::npos) {
144 break;
145 }
146 res->append(s, start_pos, pos - start_pos);
147 res->append(newsub);
148 start_pos = pos + oldsub.size(); // start searching again after the "old"
149 } while (replace_all);
150 res->append(s, start_pos, s.length() - start_pos);
151 }
152
153 // ----------------------------------------------------------------------
154 // StringReplace()
155 // Give me a string and two patterns "old" and "new", and I replace
156 // the first instance of "old" in the string with "new", if it
157 // exists. If "global" is true; call this repeatedly until it
158 // fails. RETURN a new string, regardless of whether the replacement
159 // happened or not.
160 // ----------------------------------------------------------------------
161
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)162 string StringReplace(const string& s, const string& oldsub,
163 const string& newsub, bool replace_all) {
164 string ret;
165 StringReplace(s, oldsub, newsub, replace_all, &ret);
166 return ret;
167 }
168
169 // ----------------------------------------------------------------------
170 // SplitStringUsing()
171 // Split a string using a character delimiter. Append the components
172 // to 'result'.
173 //
174 // Note: For multi-character delimiters, this routine will split on *ANY* of
175 // the characters in the string, not the entire string as a single delimiter.
176 // ----------------------------------------------------------------------
177 template <typename ITR>
178 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)179 void SplitStringToIteratorUsing(const string& full,
180 const char* delim,
181 ITR& result) {
182 // Optimize the common case where delim is a single character.
183 if (delim[0] != '\0' && delim[1] == '\0') {
184 char c = delim[0];
185 const char* p = full.data();
186 const char* end = p + full.size();
187 while (p != end) {
188 if (*p == c) {
189 ++p;
190 } else {
191 const char* start = p;
192 while (++p != end && *p != c);
193 *result++ = string(start, p - start);
194 }
195 }
196 return;
197 }
198
199 string::size_type begin_index, end_index;
200 begin_index = full.find_first_not_of(delim);
201 while (begin_index != string::npos) {
202 end_index = full.find_first_of(delim, begin_index);
203 if (end_index == string::npos) {
204 *result++ = full.substr(begin_index);
205 return;
206 }
207 *result++ = full.substr(begin_index, (end_index - begin_index));
208 begin_index = full.find_first_not_of(delim, end_index);
209 }
210 }
211
SplitStringUsing(const string & full,const char * delim,vector<string> * result)212 void SplitStringUsing(const string& full,
213 const char* delim,
214 vector<string>* result) {
215 back_insert_iterator< vector<string> > it(*result);
216 SplitStringToIteratorUsing(full, delim, it);
217 }
218
219 // Split a string using a character delimiter. Append the components
220 // to 'result'. If there are consecutive delimiters, this function
221 // will return corresponding empty strings. The string is split into
222 // at most the specified number of pieces greedily. This means that the
223 // last piece may possibly be split further. To split into as many pieces
224 // as possible, specify 0 as the number of pieces.
225 //
226 // If "full" is the empty string, yields an empty string as the only value.
227 //
228 // If "pieces" is negative for some reason, it returns the whole string
229 // ----------------------------------------------------------------------
230 template <typename StringType, typename ITR>
231 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)232 void SplitStringToIteratorAllowEmpty(const StringType& full,
233 const char* delim,
234 int pieces,
235 ITR& result) {
236 string::size_type begin_index, end_index;
237 begin_index = 0;
238
239 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
240 end_index = full.find_first_of(delim, begin_index);
241 if (end_index == string::npos) {
242 *result++ = full.substr(begin_index);
243 return;
244 }
245 *result++ = full.substr(begin_index, (end_index - begin_index));
246 begin_index = end_index + 1;
247 }
248 *result++ = full.substr(begin_index);
249 }
250
SplitStringAllowEmpty(const string & full,const char * delim,vector<string> * result)251 void SplitStringAllowEmpty(const string& full, const char* delim,
252 vector<string>* result) {
253 back_insert_iterator<vector<string> > it(*result);
254 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
255 }
256
257 // ----------------------------------------------------------------------
258 // JoinStrings()
259 // This merges a vector of string components with delim inserted
260 // as separaters between components.
261 //
262 // ----------------------------------------------------------------------
263 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)264 static void JoinStringsIterator(const ITERATOR& start,
265 const ITERATOR& end,
266 const char* delim,
267 string* result) {
268 GOOGLE_CHECK(result != NULL);
269 result->clear();
270 int delim_length = strlen(delim);
271
272 // Precompute resulting length so we can reserve() memory in one shot.
273 int length = 0;
274 for (ITERATOR iter = start; iter != end; ++iter) {
275 if (iter != start) {
276 length += delim_length;
277 }
278 length += iter->size();
279 }
280 result->reserve(length);
281
282 // Now combine everything.
283 for (ITERATOR iter = start; iter != end; ++iter) {
284 if (iter != start) {
285 result->append(delim, delim_length);
286 }
287 result->append(iter->data(), iter->size());
288 }
289 }
290
JoinStrings(const vector<string> & components,const char * delim,string * result)291 void JoinStrings(const vector<string>& components,
292 const char* delim,
293 string * result) {
294 JoinStringsIterator(components.begin(), components.end(), delim, result);
295 }
296
297 // ----------------------------------------------------------------------
298 // UnescapeCEscapeSequences()
299 // This does all the unescaping that C does: \ooo, \r, \n, etc
300 // Returns length of resulting string.
301 // The implementation of \x parses any positive number of hex digits,
302 // but it is an error if the value requires more than 8 bits, and the
303 // result is truncated to 8 bits.
304 //
305 // The second call stores its errors in a supplied string vector.
306 // If the string vector pointer is NULL, it reports the errors with LOG().
307 // ----------------------------------------------------------------------
308
309 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
310
311 // Protocol buffers doesn't ever care about errors, but I don't want to remove
312 // the code.
313 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
314
UnescapeCEscapeSequences(const char * source,char * dest)315 int UnescapeCEscapeSequences(const char* source, char* dest) {
316 return UnescapeCEscapeSequences(source, dest, NULL);
317 }
318
UnescapeCEscapeSequences(const char * source,char * dest,vector<string> * errors)319 int UnescapeCEscapeSequences(const char* source, char* dest,
320 vector<string> *errors) {
321 GOOGLE_DCHECK(errors == NULL) << "Error reporting not implemented.";
322
323 char* d = dest;
324 const char* p = source;
325
326 // Small optimization for case where source = dest and there's no escaping
327 while ( p == d && *p != '\0' && *p != '\\' )
328 p++, d++;
329
330 while (*p != '\0') {
331 if (*p != '\\') {
332 *d++ = *p++;
333 } else {
334 switch ( *++p ) { // skip past the '\\'
335 case '\0':
336 LOG_STRING(ERROR, errors) << "String cannot end with \\";
337 *d = '\0';
338 return d - dest; // we're done with p
339 case 'a': *d++ = '\a'; break;
340 case 'b': *d++ = '\b'; break;
341 case 'f': *d++ = '\f'; break;
342 case 'n': *d++ = '\n'; break;
343 case 'r': *d++ = '\r'; break;
344 case 't': *d++ = '\t'; break;
345 case 'v': *d++ = '\v'; break;
346 case '\\': *d++ = '\\'; break;
347 case '?': *d++ = '\?'; break; // \? Who knew?
348 case '\'': *d++ = '\''; break;
349 case '"': *d++ = '\"'; break;
350 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
351 case '4': case '5': case '6': case '7': {
352 char ch = *p - '0';
353 if ( IS_OCTAL_DIGIT(p[1]) )
354 ch = ch * 8 + *++p - '0';
355 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
356 ch = ch * 8 + *++p - '0'; // now points at last digit
357 *d++ = ch;
358 break;
359 }
360 case 'x': case 'X': {
361 if (!isxdigit(p[1])) {
362 if (p[1] == '\0') {
363 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
364 } else {
365 LOG_STRING(ERROR, errors) <<
366 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
367 }
368 break;
369 }
370 unsigned int ch = 0;
371 const char *hex_start = p;
372 while (isxdigit(p[1])) // arbitrarily many hex digits
373 ch = (ch << 4) + hex_digit_to_int(*++p);
374 if (ch > 0xFF)
375 LOG_STRING(ERROR, errors) << "Value of " <<
376 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
377 *d++ = ch;
378 break;
379 }
380 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
381 case 'u': {
382 // \uhhhh => convert 4 hex digits to UTF-8
383 char32 rune = 0;
384 const char *hex_start = p;
385 for (int i = 0; i < 4; ++i) {
386 if (isxdigit(p[1])) { // Look one char ahead.
387 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
388 } else {
389 LOG_STRING(ERROR, errors)
390 << "\\u must be followed by 4 hex digits: \\"
391 << string(hex_start, p+1-hex_start);
392 break;
393 }
394 }
395 d += runetochar(d, &rune);
396 break;
397 }
398 case 'U': {
399 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
400 char32 rune = 0;
401 const char *hex_start = p;
402 for (int i = 0; i < 8; ++i) {
403 if (isxdigit(p[1])) { // Look one char ahead.
404 // Don't change rune until we're sure this
405 // is within the Unicode limit, but do advance p.
406 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
407 if (newrune > 0x10FFFF) {
408 LOG_STRING(ERROR, errors)
409 << "Value of \\"
410 << string(hex_start, p + 1 - hex_start)
411 << " exceeds Unicode limit (0x10FFFF)";
412 break;
413 } else {
414 rune = newrune;
415 }
416 } else {
417 LOG_STRING(ERROR, errors)
418 << "\\U must be followed by 8 hex digits: \\"
419 << string(hex_start, p+1-hex_start);
420 break;
421 }
422 }
423 d += runetochar(d, &rune);
424 break;
425 }
426 #endif
427 default:
428 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
429 }
430 p++; // read past letter we escaped
431 }
432 }
433 *d = '\0';
434 return d - dest;
435 }
436
437 // ----------------------------------------------------------------------
438 // UnescapeCEscapeString()
439 // This does the same thing as UnescapeCEscapeSequences, but creates
440 // a new string. The caller does not need to worry about allocating
441 // a dest buffer. This should be used for non performance critical
442 // tasks such as printing debug messages. It is safe for src and dest
443 // to be the same.
444 //
445 // The second call stores its errors in a supplied string vector.
446 // If the string vector pointer is NULL, it reports the errors with LOG().
447 //
448 // In the first and second calls, the length of dest is returned. In the
449 // the third call, the new string is returned.
450 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)451 int UnescapeCEscapeString(const string& src, string* dest) {
452 return UnescapeCEscapeString(src, dest, NULL);
453 }
454
UnescapeCEscapeString(const string & src,string * dest,vector<string> * errors)455 int UnescapeCEscapeString(const string& src, string* dest,
456 vector<string> *errors) {
457 scoped_array<char> unescaped(new char[src.size() + 1]);
458 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
459 GOOGLE_CHECK(dest);
460 dest->assign(unescaped.get(), len);
461 return len;
462 }
463
UnescapeCEscapeString(const string & src)464 string UnescapeCEscapeString(const string& src) {
465 scoped_array<char> unescaped(new char[src.size() + 1]);
466 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), NULL);
467 return string(unescaped.get(), len);
468 }
469
470 // ----------------------------------------------------------------------
471 // CEscapeString()
472 // CHexEscapeString()
473 // Copies 'src' to 'dest', escaping dangerous characters using
474 // C-style escape sequences. This is very useful for preparing query
475 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
476 // hexadecimal rather than octal sequences.
477 // Returns the number of bytes written to 'dest' (not including the \0)
478 // or -1 if there was insufficient space.
479 //
480 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
481 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)482 int CEscapeInternal(const char* src, int src_len, char* dest,
483 int dest_len, bool use_hex, bool utf8_safe) {
484 const char* src_end = src + src_len;
485 int used = 0;
486 bool last_hex_escape = false; // true if last output char was \xNN
487
488 for (; src < src_end; src++) {
489 if (dest_len - used < 2) // Need space for two letter escape
490 return -1;
491
492 bool is_hex_escape = false;
493 switch (*src) {
494 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
495 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
496 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
497 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
498 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
499 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
500 default:
501 // Note that if we emit \xNN and the src character after that is a hex
502 // digit then that digit must be escaped too to prevent it being
503 // interpreted as part of the character code by C.
504 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
505 (!isprint(*src) ||
506 (last_hex_escape && isxdigit(*src)))) {
507 if (dest_len - used < 4) // need space for 4 letter escape
508 return -1;
509 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
510 static_cast<uint8>(*src));
511 is_hex_escape = use_hex;
512 used += 4;
513 } else {
514 dest[used++] = *src; break;
515 }
516 }
517 last_hex_escape = is_hex_escape;
518 }
519
520 if (dest_len - used < 1) // make sure that there is room for \0
521 return -1;
522
523 dest[used] = '\0'; // doesn't count towards return value though
524 return used;
525 }
526
527 // Calculates the length of the C-style escaped version of 'src'.
528 // Assumes that non-printable characters are escaped using octal sequences, and
529 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)530 static inline size_t CEscapedLength(StringPiece src) {
531 static char c_escaped_len[256] = {
532 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
533 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
534 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
538 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
539 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
540 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
547 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
548 };
549
550 size_t escaped_len = 0;
551 for (int i = 0; i < src.size(); ++i) {
552 unsigned char c = static_cast<unsigned char>(src[i]);
553 escaped_len += c_escaped_len[c];
554 }
555 return escaped_len;
556 }
557
558 // ----------------------------------------------------------------------
559 // Escapes 'src' using C-style escape sequences, and appends the escaped string
560 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
561 // the required space using a lookup table, and also does not do any special
562 // handling for Hex or UTF-8 characters.
563 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)564 void CEscapeAndAppend(StringPiece src, string* dest) {
565 size_t escaped_len = CEscapedLength(src);
566 if (escaped_len == src.size()) {
567 dest->append(src.data(), src.size());
568 return;
569 }
570
571 size_t cur_dest_len = dest->size();
572 dest->resize(cur_dest_len + escaped_len);
573 char* append_ptr = &(*dest)[cur_dest_len];
574
575 for (int i = 0; i < src.size(); ++i) {
576 unsigned char c = static_cast<unsigned char>(src[i]);
577 switch (c) {
578 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
579 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
580 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
581 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
582 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
583 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
584 default:
585 if (!isprint(c)) {
586 *append_ptr++ = '\\';
587 *append_ptr++ = '0' + c / 64;
588 *append_ptr++ = '0' + (c % 64) / 8;
589 *append_ptr++ = '0' + c % 8;
590 } else {
591 *append_ptr++ = c;
592 }
593 break;
594 }
595 }
596 }
597
CEscape(const string & src)598 string CEscape(const string& src) {
599 string dest;
600 CEscapeAndAppend(src, &dest);
601 return dest;
602 }
603
604 namespace strings {
605
Utf8SafeCEscape(const string & src)606 string Utf8SafeCEscape(const string& src) {
607 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
608 scoped_array<char> dest(new char[dest_length]);
609 const int len = CEscapeInternal(src.data(), src.size(),
610 dest.get(), dest_length, false, true);
611 GOOGLE_DCHECK_GE(len, 0);
612 return string(dest.get(), len);
613 }
614
CHexEscape(const string & src)615 string CHexEscape(const string& src) {
616 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
617 scoped_array<char> dest(new char[dest_length]);
618 const int len = CEscapeInternal(src.data(), src.size(),
619 dest.get(), dest_length, true, false);
620 GOOGLE_DCHECK_GE(len, 0);
621 return string(dest.get(), len);
622 }
623
624 } // namespace strings
625
626 // ----------------------------------------------------------------------
627 // strto32_adaptor()
628 // strtou32_adaptor()
629 // Implementation of strto[u]l replacements that have identical
630 // overflow and underflow characteristics for both ILP-32 and LP-64
631 // platforms, including errno preservation in error-free calls.
632 // ----------------------------------------------------------------------
633
strto32_adaptor(const char * nptr,char ** endptr,int base)634 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
635 const int saved_errno = errno;
636 errno = 0;
637 const long result = strtol(nptr, endptr, base);
638 if (errno == ERANGE && result == LONG_MIN) {
639 return kint32min;
640 } else if (errno == ERANGE && result == LONG_MAX) {
641 return kint32max;
642 } else if (errno == 0 && result < kint32min) {
643 errno = ERANGE;
644 return kint32min;
645 } else if (errno == 0 && result > kint32max) {
646 errno = ERANGE;
647 return kint32max;
648 }
649 if (errno == 0)
650 errno = saved_errno;
651 return static_cast<int32>(result);
652 }
653
strtou32_adaptor(const char * nptr,char ** endptr,int base)654 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
655 const int saved_errno = errno;
656 errno = 0;
657 const unsigned long result = strtoul(nptr, endptr, base);
658 if (errno == ERANGE && result == ULONG_MAX) {
659 return kuint32max;
660 } else if (errno == 0 && result > kuint32max) {
661 errno = ERANGE;
662 return kuint32max;
663 }
664 if (errno == 0)
665 errno = saved_errno;
666 return static_cast<uint32>(result);
667 }
668
safe_parse_sign(string * text,bool * negative_ptr)669 inline bool safe_parse_sign(string* text /*inout*/,
670 bool* negative_ptr /*output*/) {
671 const char* start = text->data();
672 const char* end = start + text->size();
673
674 // Consume whitespace.
675 while (start < end && (start[0] == ' ')) {
676 ++start;
677 }
678 while (start < end && (end[-1] == ' ')) {
679 --end;
680 }
681 if (start >= end) {
682 return false;
683 }
684
685 // Consume sign.
686 *negative_ptr = (start[0] == '-');
687 if (*negative_ptr || start[0] == '+') {
688 ++start;
689 if (start >= end) {
690 return false;
691 }
692 }
693 *text = text->substr(start - text->data(), end - start);
694 return true;
695 }
696
697 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)698 bool safe_parse_positive_int(
699 string text, IntType* value_p) {
700 int base = 10;
701 IntType value = 0;
702 const IntType vmax = std::numeric_limits<IntType>::max();
703 assert(vmax > 0);
704 assert(vmax >= base);
705 const IntType vmax_over_base = vmax / base;
706 const char* start = text.data();
707 const char* end = start + text.size();
708 // loop over digits
709 for (; start < end; ++start) {
710 unsigned char c = static_cast<unsigned char>(start[0]);
711 int digit = c - '0';
712 if (digit >= base || digit < 0) {
713 *value_p = value;
714 return false;
715 }
716 if (value > vmax_over_base) {
717 *value_p = vmax;
718 return false;
719 }
720 value *= base;
721 if (value > vmax - digit) {
722 *value_p = vmax;
723 return false;
724 }
725 value += digit;
726 }
727 *value_p = value;
728 return true;
729 }
730
731 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)732 bool safe_parse_negative_int(
733 const string& text, IntType* value_p) {
734 int base = 10;
735 IntType value = 0;
736 const IntType vmin = std::numeric_limits<IntType>::min();
737 assert(vmin < 0);
738 assert(vmin <= 0 - base);
739 IntType vmin_over_base = vmin / base;
740 // 2003 c++ standard [expr.mul]
741 // "... the sign of the remainder is implementation-defined."
742 // Although (vmin/base)*base + vmin%base is always vmin.
743 // 2011 c++ standard tightens the spec but we cannot rely on it.
744 if (vmin % base > 0) {
745 vmin_over_base += 1;
746 }
747 const char* start = text.data();
748 const char* end = start + text.size();
749 // loop over digits
750 for (; start < end; ++start) {
751 unsigned char c = static_cast<unsigned char>(start[0]);
752 int digit = c - '0';
753 if (digit >= base || digit < 0) {
754 *value_p = value;
755 return false;
756 }
757 if (value < vmin_over_base) {
758 *value_p = vmin;
759 return false;
760 }
761 value *= base;
762 if (value < vmin + digit) {
763 *value_p = vmin;
764 return false;
765 }
766 value -= digit;
767 }
768 *value_p = value;
769 return true;
770 }
771
772 template<typename IntType>
safe_int_internal(string text,IntType * value_p)773 bool safe_int_internal(string text, IntType* value_p) {
774 *value_p = 0;
775 bool negative;
776 if (!safe_parse_sign(&text, &negative)) {
777 return false;
778 }
779 if (!negative) {
780 return safe_parse_positive_int(text, value_p);
781 } else {
782 return safe_parse_negative_int(text, value_p);
783 }
784 }
785
786 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)787 bool safe_uint_internal(string text, IntType* value_p) {
788 *value_p = 0;
789 bool negative;
790 if (!safe_parse_sign(&text, &negative) || negative) {
791 return false;
792 }
793 return safe_parse_positive_int(text, value_p);
794 }
795
796 // ----------------------------------------------------------------------
797 // FastIntToBuffer()
798 // FastInt64ToBuffer()
799 // FastHexToBuffer()
800 // FastHex64ToBuffer()
801 // FastHex32ToBuffer()
802 // ----------------------------------------------------------------------
803
804 // Offset into buffer where FastInt64ToBuffer places the end of string
805 // null character. Also used by FastInt64ToBufferLeft.
806 static const int kFastInt64ToBufferOffset = 21;
807
FastInt64ToBuffer(int64 i,char * buffer)808 char *FastInt64ToBuffer(int64 i, char* buffer) {
809 // We could collapse the positive and negative sections, but that
810 // would be slightly slower for positive numbers...
811 // 22 bytes is enough to store -2**64, -18446744073709551616.
812 char* p = buffer + kFastInt64ToBufferOffset;
813 *p-- = '\0';
814 if (i >= 0) {
815 do {
816 *p-- = '0' + i % 10;
817 i /= 10;
818 } while (i > 0);
819 return p + 1;
820 } else {
821 // On different platforms, % and / have different behaviors for
822 // negative numbers, so we need to jump through hoops to make sure
823 // we don't divide negative numbers.
824 if (i > -10) {
825 i = -i;
826 *p-- = '0' + i;
827 *p = '-';
828 return p;
829 } else {
830 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
831 i = i + 10;
832 i = -i;
833 *p-- = '0' + i % 10;
834 // Undo what we did a moment ago
835 i = i / 10 + 1;
836 do {
837 *p-- = '0' + i % 10;
838 i /= 10;
839 } while (i > 0);
840 *p = '-';
841 return p;
842 }
843 }
844 }
845
846 // Offset into buffer where FastInt32ToBuffer places the end of string
847 // null character. Also used by FastInt32ToBufferLeft
848 static const int kFastInt32ToBufferOffset = 11;
849
850 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
851 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
852 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)853 char *FastInt32ToBuffer(int32 i, char* buffer) {
854 // We could collapse the positive and negative sections, but that
855 // would be slightly slower for positive numbers...
856 // 12 bytes is enough to store -2**32, -4294967296.
857 char* p = buffer + kFastInt32ToBufferOffset;
858 *p-- = '\0';
859 if (i >= 0) {
860 do {
861 *p-- = '0' + i % 10;
862 i /= 10;
863 } while (i > 0);
864 return p + 1;
865 } else {
866 // On different platforms, % and / have different behaviors for
867 // negative numbers, so we need to jump through hoops to make sure
868 // we don't divide negative numbers.
869 if (i > -10) {
870 i = -i;
871 *p-- = '0' + i;
872 *p = '-';
873 return p;
874 } else {
875 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
876 i = i + 10;
877 i = -i;
878 *p-- = '0' + i % 10;
879 // Undo what we did a moment ago
880 i = i / 10 + 1;
881 do {
882 *p-- = '0' + i % 10;
883 i /= 10;
884 } while (i > 0);
885 *p = '-';
886 return p;
887 }
888 }
889 }
890
FastHexToBuffer(int i,char * buffer)891 char *FastHexToBuffer(int i, char* buffer) {
892 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
893
894 static const char *hexdigits = "0123456789abcdef";
895 char *p = buffer + 21;
896 *p-- = '\0';
897 do {
898 *p-- = hexdigits[i & 15]; // mod by 16
899 i >>= 4; // divide by 16
900 } while (i > 0);
901 return p + 1;
902 }
903
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)904 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
905 static const char *hexdigits = "0123456789abcdef";
906 buffer[num_byte] = '\0';
907 for (int i = num_byte - 1; i >= 0; i--) {
908 #ifdef _M_X64
909 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
910 // block. Given that the uint32 cast was to improve performance on 32-bit
911 // platforms, we use 64-bit '&' directly.
912 buffer[i] = hexdigits[value & 0xf];
913 #else
914 buffer[i] = hexdigits[uint32(value) & 0xf];
915 #endif
916 value >>= 4;
917 }
918 return buffer;
919 }
920
FastHex64ToBuffer(uint64 value,char * buffer)921 char *FastHex64ToBuffer(uint64 value, char* buffer) {
922 return InternalFastHexToBuffer(value, buffer, 16);
923 }
924
FastHex32ToBuffer(uint32 value,char * buffer)925 char *FastHex32ToBuffer(uint32 value, char* buffer) {
926 return InternalFastHexToBuffer(value, buffer, 8);
927 }
928
929 // ----------------------------------------------------------------------
930 // FastInt32ToBufferLeft()
931 // FastUInt32ToBufferLeft()
932 // FastInt64ToBufferLeft()
933 // FastUInt64ToBufferLeft()
934 //
935 // Like the Fast*ToBuffer() functions above, these are intended for speed.
936 // Unlike the Fast*ToBuffer() functions, however, these functions write
937 // their output to the beginning of the buffer (hence the name, as the
938 // output is left-aligned). The caller is responsible for ensuring that
939 // the buffer has enough space to hold the output.
940 //
941 // Returns a pointer to the end of the string (i.e. the null character
942 // terminating the string).
943 // ----------------------------------------------------------------------
944
945 static const char two_ASCII_digits[100][2] = {
946 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
947 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
948 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
949 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
950 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
951 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
952 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
953 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
954 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
955 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
956 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
957 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
958 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
959 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
960 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
961 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
962 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
963 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
964 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
965 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
966 };
967
FastUInt32ToBufferLeft(uint32 u,char * buffer)968 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
969 int digits;
970 const char *ASCII_digits = NULL;
971 // The idea of this implementation is to trim the number of divides to as few
972 // as possible by using multiplication and subtraction rather than mod (%),
973 // and by outputting two digits at a time rather than one.
974 // The huge-number case is first, in the hopes that the compiler will output
975 // that case in one branch-free block of code, and only output conditional
976 // branches into it from below.
977 if (u >= 1000000000) { // >= 1,000,000,000
978 digits = u / 100000000; // 100,000,000
979 ASCII_digits = two_ASCII_digits[digits];
980 buffer[0] = ASCII_digits[0];
981 buffer[1] = ASCII_digits[1];
982 buffer += 2;
983 sublt100_000_000:
984 u -= digits * 100000000; // 100,000,000
985 lt100_000_000:
986 digits = u / 1000000; // 1,000,000
987 ASCII_digits = two_ASCII_digits[digits];
988 buffer[0] = ASCII_digits[0];
989 buffer[1] = ASCII_digits[1];
990 buffer += 2;
991 sublt1_000_000:
992 u -= digits * 1000000; // 1,000,000
993 lt1_000_000:
994 digits = u / 10000; // 10,000
995 ASCII_digits = two_ASCII_digits[digits];
996 buffer[0] = ASCII_digits[0];
997 buffer[1] = ASCII_digits[1];
998 buffer += 2;
999 sublt10_000:
1000 u -= digits * 10000; // 10,000
1001 lt10_000:
1002 digits = u / 100;
1003 ASCII_digits = two_ASCII_digits[digits];
1004 buffer[0] = ASCII_digits[0];
1005 buffer[1] = ASCII_digits[1];
1006 buffer += 2;
1007 sublt100:
1008 u -= digits * 100;
1009 lt100:
1010 digits = u;
1011 ASCII_digits = two_ASCII_digits[digits];
1012 buffer[0] = ASCII_digits[0];
1013 buffer[1] = ASCII_digits[1];
1014 buffer += 2;
1015 done:
1016 *buffer = 0;
1017 return buffer;
1018 }
1019
1020 if (u < 100) {
1021 digits = u;
1022 if (u >= 10) goto lt100;
1023 *buffer++ = '0' + digits;
1024 goto done;
1025 }
1026 if (u < 10000) { // 10,000
1027 if (u >= 1000) goto lt10_000;
1028 digits = u / 100;
1029 *buffer++ = '0' + digits;
1030 goto sublt100;
1031 }
1032 if (u < 1000000) { // 1,000,000
1033 if (u >= 100000) goto lt1_000_000;
1034 digits = u / 10000; // 10,000
1035 *buffer++ = '0' + digits;
1036 goto sublt10_000;
1037 }
1038 if (u < 100000000) { // 100,000,000
1039 if (u >= 10000000) goto lt100_000_000;
1040 digits = u / 1000000; // 1,000,000
1041 *buffer++ = '0' + digits;
1042 goto sublt1_000_000;
1043 }
1044 // we already know that u < 1,000,000,000
1045 digits = u / 100000000; // 100,000,000
1046 *buffer++ = '0' + digits;
1047 goto sublt100_000_000;
1048 }
1049
FastInt32ToBufferLeft(int32 i,char * buffer)1050 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1051 uint32 u = i;
1052 if (i < 0) {
1053 *buffer++ = '-';
1054 u = -i;
1055 }
1056 return FastUInt32ToBufferLeft(u, buffer);
1057 }
1058
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1059 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1060 int digits;
1061 const char *ASCII_digits = NULL;
1062
1063 uint32 u = static_cast<uint32>(u64);
1064 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1065
1066 uint64 top_11_digits = u64 / 1000000000;
1067 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1068 u = u64 - (top_11_digits * 1000000000);
1069
1070 digits = u / 10000000; // 10,000,000
1071 GOOGLE_DCHECK_LT(digits, 100);
1072 ASCII_digits = two_ASCII_digits[digits];
1073 buffer[0] = ASCII_digits[0];
1074 buffer[1] = ASCII_digits[1];
1075 buffer += 2;
1076 u -= digits * 10000000; // 10,000,000
1077 digits = u / 100000; // 100,000
1078 ASCII_digits = two_ASCII_digits[digits];
1079 buffer[0] = ASCII_digits[0];
1080 buffer[1] = ASCII_digits[1];
1081 buffer += 2;
1082 u -= digits * 100000; // 100,000
1083 digits = u / 1000; // 1,000
1084 ASCII_digits = two_ASCII_digits[digits];
1085 buffer[0] = ASCII_digits[0];
1086 buffer[1] = ASCII_digits[1];
1087 buffer += 2;
1088 u -= digits * 1000; // 1,000
1089 digits = u / 10;
1090 ASCII_digits = two_ASCII_digits[digits];
1091 buffer[0] = ASCII_digits[0];
1092 buffer[1] = ASCII_digits[1];
1093 buffer += 2;
1094 u -= digits * 10;
1095 digits = u;
1096 *buffer++ = '0' + digits;
1097 *buffer = 0;
1098 return buffer;
1099 }
1100
FastInt64ToBufferLeft(int64 i,char * buffer)1101 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1102 uint64 u = i;
1103 if (i < 0) {
1104 *buffer++ = '-';
1105 u = -i;
1106 }
1107 return FastUInt64ToBufferLeft(u, buffer);
1108 }
1109
1110 // ----------------------------------------------------------------------
1111 // SimpleItoa()
1112 // Description: converts an integer to a string.
1113 //
1114 // Return value: string
1115 // ----------------------------------------------------------------------
1116
SimpleItoa(int i)1117 string SimpleItoa(int i) {
1118 char buffer[kFastToBufferSize];
1119 return (sizeof(i) == 4) ?
1120 FastInt32ToBuffer(i, buffer) :
1121 FastInt64ToBuffer(i, buffer);
1122 }
1123
SimpleItoa(unsigned int i)1124 string SimpleItoa(unsigned int i) {
1125 char buffer[kFastToBufferSize];
1126 return string(buffer, (sizeof(i) == 4) ?
1127 FastUInt32ToBufferLeft(i, buffer) :
1128 FastUInt64ToBufferLeft(i, buffer));
1129 }
1130
SimpleItoa(long i)1131 string SimpleItoa(long i) {
1132 char buffer[kFastToBufferSize];
1133 return (sizeof(i) == 4) ?
1134 FastInt32ToBuffer(i, buffer) :
1135 FastInt64ToBuffer(i, buffer);
1136 }
1137
SimpleItoa(unsigned long i)1138 string SimpleItoa(unsigned long i) {
1139 char buffer[kFastToBufferSize];
1140 return string(buffer, (sizeof(i) == 4) ?
1141 FastUInt32ToBufferLeft(i, buffer) :
1142 FastUInt64ToBufferLeft(i, buffer));
1143 }
1144
SimpleItoa(long long i)1145 string SimpleItoa(long long i) {
1146 char buffer[kFastToBufferSize];
1147 return (sizeof(i) == 4) ?
1148 FastInt32ToBuffer(i, buffer) :
1149 FastInt64ToBuffer(i, buffer);
1150 }
1151
SimpleItoa(unsigned long long i)1152 string SimpleItoa(unsigned long long i) {
1153 char buffer[kFastToBufferSize];
1154 return string(buffer, (sizeof(i) == 4) ?
1155 FastUInt32ToBufferLeft(i, buffer) :
1156 FastUInt64ToBufferLeft(i, buffer));
1157 }
1158
1159 // ----------------------------------------------------------------------
1160 // SimpleDtoa()
1161 // SimpleFtoa()
1162 // DoubleToBuffer()
1163 // FloatToBuffer()
1164 // We want to print the value without losing precision, but we also do
1165 // not want to print more digits than necessary. This turns out to be
1166 // trickier than it sounds. Numbers like 0.2 cannot be represented
1167 // exactly in binary. If we print 0.2 with a very large precision,
1168 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1169 // On the other hand, if we set the precision too low, we lose
1170 // significant digits when printing numbers that actually need them.
1171 // It turns out there is no precision value that does the right thing
1172 // for all numbers.
1173 //
1174 // Our strategy is to first try printing with a precision that is never
1175 // over-precise, then parse the result with strtod() to see if it
1176 // matches. If not, we print again with a precision that will always
1177 // give a precise result, but may use more digits than necessary.
1178 //
1179 // An arguably better strategy would be to use the algorithm described
1180 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1181 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1182 // however, that the following implementation is about as fast as
1183 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1184 // will not scale well on multi-core machines. DMG's code is slightly
1185 // more accurate (in that it will never use more digits than
1186 // necessary), but this is probably irrelevant for most users.
1187 //
1188 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1189 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1190 // one in that it makes guesses and then uses strtod() to check them.
1191 // Their implementation is faster because they use their own code to
1192 // generate the digits in the first place rather than use snprintf(),
1193 // thus avoiding format string parsing overhead. However, this makes
1194 // it considerably more complicated than the following implementation,
1195 // and it is embedded in a larger library. If speed turns out to be
1196 // an issue, we could re-implement this in terms of their
1197 // implementation.
1198 // ----------------------------------------------------------------------
1199
SimpleDtoa(double value)1200 string SimpleDtoa(double value) {
1201 char buffer[kDoubleToBufferSize];
1202 return DoubleToBuffer(value, buffer);
1203 }
1204
SimpleFtoa(float value)1205 string SimpleFtoa(float value) {
1206 char buffer[kFloatToBufferSize];
1207 return FloatToBuffer(value, buffer);
1208 }
1209
IsValidFloatChar(char c)1210 static inline bool IsValidFloatChar(char c) {
1211 return ('0' <= c && c <= '9') ||
1212 c == 'e' || c == 'E' ||
1213 c == '+' || c == '-';
1214 }
1215
DelocalizeRadix(char * buffer)1216 void DelocalizeRadix(char* buffer) {
1217 // Fast check: if the buffer has a normal decimal point, assume no
1218 // translation is needed.
1219 if (strchr(buffer, '.') != NULL) return;
1220
1221 // Find the first unknown character.
1222 while (IsValidFloatChar(*buffer)) ++buffer;
1223
1224 if (*buffer == '\0') {
1225 // No radix character found.
1226 return;
1227 }
1228
1229 // We are now pointing at the locale-specific radix character. Replace it
1230 // with '.'.
1231 *buffer = '.';
1232 ++buffer;
1233
1234 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1235 // It appears the radix was a multi-byte character. We need to remove the
1236 // extra bytes.
1237 char* target = buffer;
1238 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1239 memmove(target, buffer, strlen(buffer) + 1);
1240 }
1241 }
1242
DoubleToBuffer(double value,char * buffer)1243 char* DoubleToBuffer(double value, char* buffer) {
1244 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1245 // platforms these days. Just in case some system exists where DBL_DIG
1246 // is significantly larger -- and risks overflowing our buffer -- we have
1247 // this assert.
1248 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1249
1250 if (value == numeric_limits<double>::infinity()) {
1251 strcpy(buffer, "inf");
1252 return buffer;
1253 } else if (value == -numeric_limits<double>::infinity()) {
1254 strcpy(buffer, "-inf");
1255 return buffer;
1256 } else if (MathLimits<double>::IsNaN(value)) {
1257 strcpy(buffer, "nan");
1258 return buffer;
1259 }
1260
1261 int snprintf_result =
1262 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1263
1264 // The snprintf should never overflow because the buffer is significantly
1265 // larger than the precision we asked for.
1266 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1267
1268 // We need to make parsed_value volatile in order to force the compiler to
1269 // write it out to the stack. Otherwise, it may keep the value in a
1270 // register, and if it does that, it may keep it as a long double instead
1271 // of a double. This long double may have extra bits that make it compare
1272 // unequal to "value" even though it would be exactly equal if it were
1273 // truncated to a double.
1274 volatile double parsed_value = strtod(buffer, NULL);
1275 if (parsed_value != value) {
1276 int snprintf_result =
1277 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1278
1279 // Should never overflow; see above.
1280 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1281 }
1282
1283 DelocalizeRadix(buffer);
1284 return buffer;
1285 }
1286
memcasecmp(const char * s1,const char * s2,size_t len)1287 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1288 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1289 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1290
1291 for ( int i = 0; i < len; i++ ) {
1292 const int diff =
1293 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1294 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1295 if (diff != 0) return diff;
1296 }
1297 return 0;
1298 }
1299
CaseEqual(StringPiece s1,StringPiece s2)1300 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1301 if (s1.size() != s2.size()) return false;
1302 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1303 }
1304
safe_strtob(StringPiece str,bool * value)1305 bool safe_strtob(StringPiece str, bool* value) {
1306 GOOGLE_CHECK(value != NULL) << "NULL output boolean given.";
1307 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1308 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1309 CaseEqual(str, "1")) {
1310 *value = true;
1311 return true;
1312 }
1313 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1314 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1315 CaseEqual(str, "0")) {
1316 *value = false;
1317 return true;
1318 }
1319 return false;
1320 }
1321
safe_strtof(const char * str,float * value)1322 bool safe_strtof(const char* str, float* value) {
1323 char* endptr;
1324 errno = 0; // errno only gets set on errors
1325 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1326 *value = strtod(str, &endptr);
1327 #else
1328 *value = strtof(str, &endptr);
1329 #endif
1330 return *str != 0 && *endptr == 0 && errno == 0;
1331 }
1332
safe_strtod(const char * str,double * value)1333 bool safe_strtod(const char* str, double* value) {
1334 char* endptr;
1335 *value = strtod(str, &endptr);
1336 if (endptr != str) {
1337 while (ascii_isspace(*endptr)) ++endptr;
1338 }
1339 // Ignore range errors from strtod. The values it
1340 // returns on underflow and overflow are the right
1341 // fallback in a robust setting.
1342 return *str != '\0' && *endptr == '\0';
1343 }
1344
safe_strto32(const string & str,int32 * value)1345 bool safe_strto32(const string& str, int32* value) {
1346 return safe_int_internal(str, value);
1347 }
1348
safe_strtou32(const string & str,uint32 * value)1349 bool safe_strtou32(const string& str, uint32* value) {
1350 return safe_uint_internal(str, value);
1351 }
1352
safe_strto64(const string & str,int64 * value)1353 bool safe_strto64(const string& str, int64* value) {
1354 return safe_int_internal(str, value);
1355 }
1356
safe_strtou64(const string & str,uint64 * value)1357 bool safe_strtou64(const string& str, uint64* value) {
1358 return safe_uint_internal(str, value);
1359 }
1360
FloatToBuffer(float value,char * buffer)1361 char* FloatToBuffer(float value, char* buffer) {
1362 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1363 // platforms these days. Just in case some system exists where FLT_DIG
1364 // is significantly larger -- and risks overflowing our buffer -- we have
1365 // this assert.
1366 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1367
1368 if (value == numeric_limits<double>::infinity()) {
1369 strcpy(buffer, "inf");
1370 return buffer;
1371 } else if (value == -numeric_limits<double>::infinity()) {
1372 strcpy(buffer, "-inf");
1373 return buffer;
1374 } else if (MathLimits<float>::IsNaN(value)) {
1375 strcpy(buffer, "nan");
1376 return buffer;
1377 }
1378
1379 int snprintf_result =
1380 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1381
1382 // The snprintf should never overflow because the buffer is significantly
1383 // larger than the precision we asked for.
1384 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1385
1386 float parsed_value;
1387 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1388 int snprintf_result =
1389 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+2, value);
1390
1391 // Should never overflow; see above.
1392 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1393 }
1394
1395 DelocalizeRadix(buffer);
1396 return buffer;
1397 }
1398
1399 namespace strings {
1400
AlphaNum(strings::Hex hex)1401 AlphaNum::AlphaNum(strings::Hex hex) {
1402 char *const end = &digits[kFastToBufferSize];
1403 char *writer = end;
1404 uint64 value = hex.value;
1405 uint64 width = hex.spec;
1406 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1407 // where 0x10000 is the smallest hex number that is as wide as the user
1408 // asked for.
1409 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1410 static const char hexdigits[] = "0123456789abcdef";
1411 do {
1412 *--writer = hexdigits[value & 0xF];
1413 value >>= 4;
1414 mask >>= 4;
1415 } while (mask != 0);
1416 piece_data_ = writer;
1417 piece_size_ = end - writer;
1418 }
1419
1420 } // namespace strings
1421
1422 // ----------------------------------------------------------------------
1423 // StrCat()
1424 // This merges the given strings or integers, with no delimiter. This
1425 // is designed to be the fastest possible way to construct a string out
1426 // of a mix of raw C strings, C++ strings, and integer values.
1427 // ----------------------------------------------------------------------
1428
1429 // Append is merely a version of memcpy that returns the address of the byte
1430 // after the area just overwritten. It comes in multiple flavors to minimize
1431 // call overhead.
Append1(char * out,const AlphaNum & x)1432 static char *Append1(char *out, const AlphaNum &x) {
1433 memcpy(out, x.data(), x.size());
1434 return out + x.size();
1435 }
1436
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1437 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1438 memcpy(out, x1.data(), x1.size());
1439 out += x1.size();
1440
1441 memcpy(out, x2.data(), x2.size());
1442 return out + x2.size();
1443 }
1444
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1445 static char *Append4(char *out,
1446 const AlphaNum &x1, const AlphaNum &x2,
1447 const AlphaNum &x3, const AlphaNum &x4) {
1448 memcpy(out, x1.data(), x1.size());
1449 out += x1.size();
1450
1451 memcpy(out, x2.data(), x2.size());
1452 out += x2.size();
1453
1454 memcpy(out, x3.data(), x3.size());
1455 out += x3.size();
1456
1457 memcpy(out, x4.data(), x4.size());
1458 return out + x4.size();
1459 }
1460
StrCat(const AlphaNum & a,const AlphaNum & b)1461 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1462 string result;
1463 result.resize(a.size() + b.size());
1464 char *const begin = &*result.begin();
1465 char *out = Append2(begin, a, b);
1466 GOOGLE_DCHECK_EQ(out, begin + result.size());
1467 return result;
1468 }
1469
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1470 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1471 string result;
1472 result.resize(a.size() + b.size() + c.size());
1473 char *const begin = &*result.begin();
1474 char *out = Append2(begin, a, b);
1475 out = Append1(out, c);
1476 GOOGLE_DCHECK_EQ(out, begin + result.size());
1477 return result;
1478 }
1479
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1480 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1481 const AlphaNum &d) {
1482 string result;
1483 result.resize(a.size() + b.size() + c.size() + d.size());
1484 char *const begin = &*result.begin();
1485 char *out = Append4(begin, a, b, c, d);
1486 GOOGLE_DCHECK_EQ(out, begin + result.size());
1487 return result;
1488 }
1489
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1490 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1491 const AlphaNum &d, const AlphaNum &e) {
1492 string result;
1493 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1494 char *const begin = &*result.begin();
1495 char *out = Append4(begin, a, b, c, d);
1496 out = Append1(out, e);
1497 GOOGLE_DCHECK_EQ(out, begin + result.size());
1498 return result;
1499 }
1500
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1501 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1502 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1503 string result;
1504 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1505 f.size());
1506 char *const begin = &*result.begin();
1507 char *out = Append4(begin, a, b, c, d);
1508 out = Append2(out, e, f);
1509 GOOGLE_DCHECK_EQ(out, begin + result.size());
1510 return result;
1511 }
1512
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1513 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1514 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1515 const AlphaNum &g) {
1516 string result;
1517 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1518 f.size() + g.size());
1519 char *const begin = &*result.begin();
1520 char *out = Append4(begin, a, b, c, d);
1521 out = Append2(out, e, f);
1522 out = Append1(out, g);
1523 GOOGLE_DCHECK_EQ(out, begin + result.size());
1524 return result;
1525 }
1526
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1527 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1528 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1529 const AlphaNum &g, const AlphaNum &h) {
1530 string result;
1531 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1532 f.size() + g.size() + h.size());
1533 char *const begin = &*result.begin();
1534 char *out = Append4(begin, a, b, c, d);
1535 out = Append4(out, e, f, g, h);
1536 GOOGLE_DCHECK_EQ(out, begin + result.size());
1537 return result;
1538 }
1539
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1540 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1541 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1542 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1543 string result;
1544 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1545 f.size() + g.size() + h.size() + i.size());
1546 char *const begin = &*result.begin();
1547 char *out = Append4(begin, a, b, c, d);
1548 out = Append4(out, e, f, g, h);
1549 out = Append1(out, i);
1550 GOOGLE_DCHECK_EQ(out, begin + result.size());
1551 return result;
1552 }
1553
1554 // It's possible to call StrAppend with a char * pointer that is partway into
1555 // the string we're appending to. However the results of this are random.
1556 // Therefore, check for this in debug mode. Use unsigned math so we only have
1557 // to do one comparison.
1558 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1559 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1560 uintptr_t((dest).size()))
1561
StrAppend(string * result,const AlphaNum & a)1562 void StrAppend(string *result, const AlphaNum &a) {
1563 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1564 result->append(a.data(), a.size());
1565 }
1566
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1567 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1568 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1569 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1570 string::size_type old_size = result->size();
1571 result->resize(old_size + a.size() + b.size());
1572 char *const begin = &*result->begin();
1573 char *out = Append2(begin + old_size, a, b);
1574 GOOGLE_DCHECK_EQ(out, begin + result->size());
1575 }
1576
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1577 void StrAppend(string *result,
1578 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1579 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1580 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1581 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1582 string::size_type old_size = result->size();
1583 result->resize(old_size + a.size() + b.size() + c.size());
1584 char *const begin = &*result->begin();
1585 char *out = Append2(begin + old_size, a, b);
1586 out = Append1(out, c);
1587 GOOGLE_DCHECK_EQ(out, begin + result->size());
1588 }
1589
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1590 void StrAppend(string *result,
1591 const AlphaNum &a, const AlphaNum &b,
1592 const AlphaNum &c, const AlphaNum &d) {
1593 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1594 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1595 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1596 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1597 string::size_type old_size = result->size();
1598 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1599 char *const begin = &*result->begin();
1600 char *out = Append4(begin + old_size, a, b, c, d);
1601 GOOGLE_DCHECK_EQ(out, begin + result->size());
1602 }
1603
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1604 int GlobalReplaceSubstring(const string& substring,
1605 const string& replacement,
1606 string* s) {
1607 GOOGLE_CHECK(s != NULL);
1608 if (s->empty() || substring.empty())
1609 return 0;
1610 string tmp;
1611 int num_replacements = 0;
1612 int pos = 0;
1613 for (int match_pos = s->find(substring.data(), pos, substring.length());
1614 match_pos != string::npos;
1615 pos = match_pos + substring.length(),
1616 match_pos = s->find(substring.data(), pos, substring.length())) {
1617 ++num_replacements;
1618 // Append the original content before the match.
1619 tmp.append(*s, pos, match_pos - pos);
1620 // Append the replacement for the match.
1621 tmp.append(replacement.begin(), replacement.end());
1622 }
1623 // Append the content after the last match. If no replacements were made, the
1624 // original string is left untouched.
1625 if (num_replacements > 0) {
1626 tmp.append(*s, pos, s->length() - pos);
1627 s->swap(tmp);
1628 }
1629 return num_replacements;
1630 }
1631
CalculateBase64EscapedLen(int input_len,bool do_padding)1632 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1633 // Base64 encodes three bytes of input at a time. If the input is not
1634 // divisible by three, we pad as appropriate.
1635 //
1636 // (from http://tools.ietf.org/html/rfc3548)
1637 // Special processing is performed if fewer than 24 bits are available
1638 // at the end of the data being encoded. A full encoding quantum is
1639 // always completed at the end of a quantity. When fewer than 24 input
1640 // bits are available in an input group, zero bits are added (on the
1641 // right) to form an integral number of 6-bit groups. Padding at the
1642 // end of the data is performed using the '=' character. Since all base
1643 // 64 input is an integral number of octets, only the following cases
1644 // can arise:
1645
1646
1647 // Base64 encodes each three bytes of input into four bytes of output.
1648 int len = (input_len / 3) * 4;
1649
1650 if (input_len % 3 == 0) {
1651 // (from http://tools.ietf.org/html/rfc3548)
1652 // (1) the final quantum of encoding input is an integral multiple of 24
1653 // bits; here, the final unit of encoded output will be an integral
1654 // multiple of 4 characters with no "=" padding,
1655 } else if (input_len % 3 == 1) {
1656 // (from http://tools.ietf.org/html/rfc3548)
1657 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1658 // final unit of encoded output will be two characters followed by two
1659 // "=" padding characters, or
1660 len += 2;
1661 if (do_padding) {
1662 len += 2;
1663 }
1664 } else { // (input_len % 3 == 2)
1665 // (from http://tools.ietf.org/html/rfc3548)
1666 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1667 // final unit of encoded output will be three characters followed by one
1668 // "=" padding character.
1669 len += 3;
1670 if (do_padding) {
1671 len += 1;
1672 }
1673 }
1674
1675 assert(len >= input_len); // make sure we didn't overflow
1676 return len;
1677 }
1678
1679 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1680 int CalculateBase64EscapedLen(int input_len) {
1681 return CalculateBase64EscapedLen(input_len, true);
1682 }
1683
1684 // ----------------------------------------------------------------------
1685 // int Base64Unescape() - base64 decoder
1686 // int Base64Escape() - base64 encoder
1687 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1688 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1689 //
1690 // Check out
1691 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1692 // care about is that...
1693 // Take the encoded stuff in groups of 4 characters and turn each
1694 // character into a code 0 to 63 thus:
1695 // A-Z map to 0 to 25
1696 // a-z map to 26 to 51
1697 // 0-9 map to 52 to 61
1698 // +(- for WebSafe) maps to 62
1699 // /(_ for WebSafe) maps to 63
1700 // There will be four numbers, all less than 64 which can be represented
1701 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1702 // Arrange the 6 digit binary numbers into three bytes as such:
1703 // aaaaaabb bbbbcccc ccdddddd
1704 // Equals signs (one or two) are used at the end of the encoded block to
1705 // indicate that the text was not an integer multiple of three bytes long.
1706 // ----------------------------------------------------------------------
1707
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1708 int Base64UnescapeInternal(const char *src_param, int szsrc,
1709 char *dest, int szdest,
1710 const signed char* unbase64) {
1711 static const char kPad64Equals = '=';
1712 static const char kPad64Dot = '.';
1713
1714 int decode = 0;
1715 int destidx = 0;
1716 int state = 0;
1717 unsigned int ch = 0;
1718 unsigned int temp = 0;
1719
1720 // If "char" is signed by default, using *src as an array index results in
1721 // accessing negative array elements. Treat the input as a pointer to
1722 // unsigned char to avoid this.
1723 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1724
1725 // The GET_INPUT macro gets the next input character, skipping
1726 // over any whitespace, and stopping when we reach the end of the
1727 // string or when we read any non-data character. The arguments are
1728 // an arbitrary identifier (used as a label for goto) and the number
1729 // of data bytes that must remain in the input to avoid aborting the
1730 // loop.
1731 #define GET_INPUT(label, remain) \
1732 label: \
1733 --szsrc; \
1734 ch = *src++; \
1735 decode = unbase64[ch]; \
1736 if (decode < 0) { \
1737 if (ascii_isspace(ch) && szsrc >= remain) \
1738 goto label; \
1739 state = 4 - remain; \
1740 break; \
1741 }
1742
1743 // if dest is null, we're just checking to see if it's legal input
1744 // rather than producing output. (I suspect this could just be done
1745 // with a regexp...). We duplicate the loop so this test can be
1746 // outside it instead of in every iteration.
1747
1748 if (dest) {
1749 // This loop consumes 4 input bytes and produces 3 output bytes
1750 // per iteration. We can't know at the start that there is enough
1751 // data left in the string for a full iteration, so the loop may
1752 // break out in the middle; if so 'state' will be set to the
1753 // number of input bytes read.
1754
1755 while (szsrc >= 4) {
1756 // We'll start by optimistically assuming that the next four
1757 // bytes of the string (src[0..3]) are four good data bytes
1758 // (that is, no nulls, whitespace, padding chars, or illegal
1759 // chars). We need to test src[0..2] for nulls individually
1760 // before constructing temp to preserve the property that we
1761 // never read past a null in the string (no matter how long
1762 // szsrc claims the string is).
1763
1764 if (!src[0] || !src[1] || !src[2] ||
1765 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1766 (unsigned(unbase64[src[1]]) << 12) |
1767 (unsigned(unbase64[src[2]]) << 6) |
1768 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1769 // Iff any of those four characters was bad (null, illegal,
1770 // whitespace, padding), then temp's high bit will be set
1771 // (because unbase64[] is -1 for all bad characters).
1772 //
1773 // We'll back up and resort to the slower decoder, which knows
1774 // how to handle those cases.
1775
1776 GET_INPUT(first, 4);
1777 temp = decode;
1778 GET_INPUT(second, 3);
1779 temp = (temp << 6) | decode;
1780 GET_INPUT(third, 2);
1781 temp = (temp << 6) | decode;
1782 GET_INPUT(fourth, 1);
1783 temp = (temp << 6) | decode;
1784 } else {
1785 // We really did have four good data bytes, so advance four
1786 // characters in the string.
1787
1788 szsrc -= 4;
1789 src += 4;
1790 decode = -1;
1791 ch = '\0';
1792 }
1793
1794 // temp has 24 bits of input, so write that out as three bytes.
1795
1796 if (destidx+3 > szdest) return -1;
1797 dest[destidx+2] = temp;
1798 temp >>= 8;
1799 dest[destidx+1] = temp;
1800 temp >>= 8;
1801 dest[destidx] = temp;
1802 destidx += 3;
1803 }
1804 } else {
1805 while (szsrc >= 4) {
1806 if (!src[0] || !src[1] || !src[2] ||
1807 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1808 (unsigned(unbase64[src[1]]) << 12) |
1809 (unsigned(unbase64[src[2]]) << 6) |
1810 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1811 GET_INPUT(first_no_dest, 4);
1812 GET_INPUT(second_no_dest, 3);
1813 GET_INPUT(third_no_dest, 2);
1814 GET_INPUT(fourth_no_dest, 1);
1815 } else {
1816 szsrc -= 4;
1817 src += 4;
1818 decode = -1;
1819 ch = '\0';
1820 }
1821 destidx += 3;
1822 }
1823 }
1824
1825 #undef GET_INPUT
1826
1827 // if the loop terminated because we read a bad character, return
1828 // now.
1829 if (decode < 0 && ch != '\0' &&
1830 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1831 return -1;
1832
1833 if (ch == kPad64Equals || ch == kPad64Dot) {
1834 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1835 // look at it again when we count to check for the proper number of
1836 // equals signs at the end.
1837 ++szsrc;
1838 --src;
1839 } else {
1840 // This loop consumes 1 input byte per iteration. It's used to
1841 // clean up the 0-3 input bytes remaining when the first, faster
1842 // loop finishes. 'temp' contains the data from 'state' input
1843 // characters read by the first loop.
1844 while (szsrc > 0) {
1845 --szsrc;
1846 ch = *src++;
1847 decode = unbase64[ch];
1848 if (decode < 0) {
1849 if (ascii_isspace(ch)) {
1850 continue;
1851 } else if (ch == '\0') {
1852 break;
1853 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1854 // back up one character; we'll read it again when we check
1855 // for the correct number of pad characters at the end.
1856 ++szsrc;
1857 --src;
1858 break;
1859 } else {
1860 return -1;
1861 }
1862 }
1863
1864 // Each input character gives us six bits of output.
1865 temp = (temp << 6) | decode;
1866 ++state;
1867 if (state == 4) {
1868 // If we've accumulated 24 bits of output, write that out as
1869 // three bytes.
1870 if (dest) {
1871 if (destidx+3 > szdest) return -1;
1872 dest[destidx+2] = temp;
1873 temp >>= 8;
1874 dest[destidx+1] = temp;
1875 temp >>= 8;
1876 dest[destidx] = temp;
1877 }
1878 destidx += 3;
1879 state = 0;
1880 temp = 0;
1881 }
1882 }
1883 }
1884
1885 // Process the leftover data contained in 'temp' at the end of the input.
1886 int expected_equals = 0;
1887 switch (state) {
1888 case 0:
1889 // Nothing left over; output is a multiple of 3 bytes.
1890 break;
1891
1892 case 1:
1893 // Bad input; we have 6 bits left over.
1894 return -1;
1895
1896 case 2:
1897 // Produce one more output byte from the 12 input bits we have left.
1898 if (dest) {
1899 if (destidx+1 > szdest) return -1;
1900 temp >>= 4;
1901 dest[destidx] = temp;
1902 }
1903 ++destidx;
1904 expected_equals = 2;
1905 break;
1906
1907 case 3:
1908 // Produce two more output bytes from the 18 input bits we have left.
1909 if (dest) {
1910 if (destidx+2 > szdest) return -1;
1911 temp >>= 2;
1912 dest[destidx+1] = temp;
1913 temp >>= 8;
1914 dest[destidx] = temp;
1915 }
1916 destidx += 2;
1917 expected_equals = 1;
1918 break;
1919
1920 default:
1921 // state should have no other values at this point.
1922 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1923 }
1924
1925 // The remainder of the string should be all whitespace, mixed with
1926 // exactly 0 equals signs, or exactly 'expected_equals' equals
1927 // signs. (Always accepting 0 equals signs is a google extension
1928 // not covered in the RFC, as is accepting dot as the pad character.)
1929
1930 int equals = 0;
1931 while (szsrc > 0 && *src) {
1932 if (*src == kPad64Equals || *src == kPad64Dot)
1933 ++equals;
1934 else if (!ascii_isspace(*src))
1935 return -1;
1936 --szsrc;
1937 ++src;
1938 }
1939
1940 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1941 }
1942
1943 // The arrays below were generated by the following code
1944 // #include <sys/time.h>
1945 // #include <stdlib.h>
1946 // #include <string.h>
1947 // main()
1948 // {
1949 // static const char Base64[] =
1950 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1951 // char *pos;
1952 // int idx, i, j;
1953 // printf(" ");
1954 // for (i = 0; i < 255; i += 8) {
1955 // for (j = i; j < i + 8; j++) {
1956 // pos = strchr(Base64, j);
1957 // if ((pos == NULL) || (j == 0))
1958 // idx = -1;
1959 // else
1960 // idx = pos - Base64;
1961 // if (idx == -1)
1962 // printf(" %2d, ", idx);
1963 // else
1964 // printf(" %2d/*%c*/,", idx, j);
1965 // }
1966 // printf("\n ");
1967 // }
1968 // }
1969 //
1970 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1971 // tables from the functions below.
1972 static const signed char kUnBase64[] = {
1973 -1, -1, -1, -1, -1, -1, -1, -1,
1974 -1, -1, -1, -1, -1, -1, -1, -1,
1975 -1, -1, -1, -1, -1, -1, -1, -1,
1976 -1, -1, -1, -1, -1, -1, -1, -1,
1977 -1, -1, -1, -1, -1, -1, -1, -1,
1978 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1979 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1980 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
1981 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
1982 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1983 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1984 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
1985 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1986 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1987 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1988 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
1989 -1, -1, -1, -1, -1, -1, -1, -1,
1990 -1, -1, -1, -1, -1, -1, -1, -1,
1991 -1, -1, -1, -1, -1, -1, -1, -1,
1992 -1, -1, -1, -1, -1, -1, -1, -1,
1993 -1, -1, -1, -1, -1, -1, -1, -1,
1994 -1, -1, -1, -1, -1, -1, -1, -1,
1995 -1, -1, -1, -1, -1, -1, -1, -1,
1996 -1, -1, -1, -1, -1, -1, -1, -1,
1997 -1, -1, -1, -1, -1, -1, -1, -1,
1998 -1, -1, -1, -1, -1, -1, -1, -1,
1999 -1, -1, -1, -1, -1, -1, -1, -1,
2000 -1, -1, -1, -1, -1, -1, -1, -1,
2001 -1, -1, -1, -1, -1, -1, -1, -1,
2002 -1, -1, -1, -1, -1, -1, -1, -1,
2003 -1, -1, -1, -1, -1, -1, -1, -1,
2004 -1, -1, -1, -1, -1, -1, -1, -1
2005 };
2006 static const signed char kUnWebSafeBase64[] = {
2007 -1, -1, -1, -1, -1, -1, -1, -1,
2008 -1, -1, -1, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1,
2011 -1, -1, -1, -1, -1, -1, -1, -1,
2012 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2013 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2014 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2015 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2016 07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2017 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2018 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2019 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2020 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2021 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2022 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2023 -1, -1, -1, -1, -1, -1, -1, -1,
2024 -1, -1, -1, -1, -1, -1, -1, -1,
2025 -1, -1, -1, -1, -1, -1, -1, -1,
2026 -1, -1, -1, -1, -1, -1, -1, -1,
2027 -1, -1, -1, -1, -1, -1, -1, -1,
2028 -1, -1, -1, -1, -1, -1, -1, -1,
2029 -1, -1, -1, -1, -1, -1, -1, -1,
2030 -1, -1, -1, -1, -1, -1, -1, -1,
2031 -1, -1, -1, -1, -1, -1, -1, -1,
2032 -1, -1, -1, -1, -1, -1, -1, -1,
2033 -1, -1, -1, -1, -1, -1, -1, -1,
2034 -1, -1, -1, -1, -1, -1, -1, -1,
2035 -1, -1, -1, -1, -1, -1, -1, -1,
2036 -1, -1, -1, -1, -1, -1, -1, -1,
2037 -1, -1, -1, -1, -1, -1, -1, -1,
2038 -1, -1, -1, -1, -1, -1, -1, -1
2039 };
2040
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2041 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2042 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2043 }
2044
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2045 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2046 const signed char* unbase64) {
2047 // Determine the size of the output string. Base64 encodes every 3 bytes into
2048 // 4 characters. any leftover chars are added directly for good measure.
2049 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2050 const int dest_len = 3 * (slen / 4) + (slen % 4);
2051
2052 dest->resize(dest_len);
2053
2054 // We are getting the destination buffer by getting the beginning of the
2055 // string and converting it into a char *.
2056 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2057 dest_len, unbase64);
2058 if (len < 0) {
2059 dest->clear();
2060 return false;
2061 }
2062
2063 // could be shorter if there was padding
2064 GOOGLE_DCHECK_LE(len, dest_len);
2065 dest->erase(len);
2066
2067 return true;
2068 }
2069
Base64Unescape(StringPiece src,string * dest)2070 bool Base64Unescape(StringPiece src, string* dest) {
2071 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2072 }
2073
WebSafeBase64Unescape(StringPiece src,string * dest)2074 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2075 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2076 }
2077
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2078 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2079 char *dest, int szdest, const char *base64,
2080 bool do_padding) {
2081 static const char kPad64 = '=';
2082
2083 if (szsrc <= 0) return 0;
2084
2085 if (szsrc * 4 > szdest * 3) return 0;
2086
2087 char *cur_dest = dest;
2088 const unsigned char *cur_src = src;
2089
2090 char *limit_dest = dest + szdest;
2091 const unsigned char *limit_src = src + szsrc;
2092
2093 // Three bytes of data encodes to four characters of cyphertext.
2094 // So we can pump through three-byte chunks atomically.
2095 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2096 uint32 in = BigEndian::Load32(cur_src) >> 8;
2097
2098 cur_dest[0] = base64[in >> 18];
2099 in &= 0x3FFFF;
2100 cur_dest[1] = base64[in >> 12];
2101 in &= 0xFFF;
2102 cur_dest[2] = base64[in >> 6];
2103 in &= 0x3F;
2104 cur_dest[3] = base64[in];
2105
2106 cur_dest += 4;
2107 cur_src += 3;
2108 }
2109 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2110 szdest = limit_dest - cur_dest;
2111 szsrc = limit_src - cur_src;
2112
2113 /* now deal with the tail (<=3 bytes) */
2114 switch (szsrc) {
2115 case 0:
2116 // Nothing left; nothing more to do.
2117 break;
2118 case 1: {
2119 // One byte left: this encodes to two characters, and (optionally)
2120 // two pad characters to round out the four-character cypherblock.
2121 if ((szdest -= 2) < 0) return 0;
2122 uint32 in = cur_src[0];
2123 cur_dest[0] = base64[in >> 2];
2124 in &= 0x3;
2125 cur_dest[1] = base64[in << 4];
2126 cur_dest += 2;
2127 if (do_padding) {
2128 if ((szdest -= 2) < 0) return 0;
2129 cur_dest[0] = kPad64;
2130 cur_dest[1] = kPad64;
2131 cur_dest += 2;
2132 }
2133 break;
2134 }
2135 case 2: {
2136 // Two bytes left: this encodes to three characters, and (optionally)
2137 // one pad character to round out the four-character cypherblock.
2138 if ((szdest -= 3) < 0) return 0;
2139 uint32 in = BigEndian::Load16(cur_src);
2140 cur_dest[0] = base64[in >> 10];
2141 in &= 0x3FF;
2142 cur_dest[1] = base64[in >> 4];
2143 in &= 0x00F;
2144 cur_dest[2] = base64[in << 2];
2145 cur_dest += 3;
2146 if (do_padding) {
2147 if ((szdest -= 1) < 0) return 0;
2148 cur_dest[0] = kPad64;
2149 cur_dest += 1;
2150 }
2151 break;
2152 }
2153 case 3: {
2154 // Three bytes left: same as in the big loop above. We can't do this in
2155 // the loop because the loop above always reads 4 bytes, and the fourth
2156 // byte is past the end of the input.
2157 if ((szdest -= 4) < 0) return 0;
2158 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2159 cur_dest[0] = base64[in >> 18];
2160 in &= 0x3FFFF;
2161 cur_dest[1] = base64[in >> 12];
2162 in &= 0xFFF;
2163 cur_dest[2] = base64[in >> 6];
2164 in &= 0x3F;
2165 cur_dest[3] = base64[in];
2166 cur_dest += 4;
2167 break;
2168 }
2169 default:
2170 // Should not be reached: blocks of 4 bytes are handled
2171 // in the while loop before this switch statement.
2172 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2173 break;
2174 }
2175 return (cur_dest - dest);
2176 }
2177
2178 static const char kBase64Chars[] =
2179 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2180
2181 static const char kWebSafeBase64Chars[] =
2182 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2183
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2184 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2185 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2186 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2187 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2188 int szdest, bool do_padding) {
2189 return Base64EscapeInternal(src, szsrc, dest, szdest,
2190 kWebSafeBase64Chars, do_padding);
2191 }
2192
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2193 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2194 string* dest, bool do_padding,
2195 const char* base64_chars) {
2196 const int calc_escaped_size =
2197 CalculateBase64EscapedLen(szsrc, do_padding);
2198 dest->resize(calc_escaped_size);
2199 const int escaped_len = Base64EscapeInternal(src, szsrc,
2200 string_as_array(dest),
2201 dest->size(),
2202 base64_chars,
2203 do_padding);
2204 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2205 dest->erase(escaped_len);
2206 }
2207
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2208 void Base64Escape(const unsigned char *src, int szsrc,
2209 string* dest, bool do_padding) {
2210 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2211 }
2212
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2213 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2214 string *dest, bool do_padding) {
2215 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2216 }
2217
Base64Escape(StringPiece src,string * dest)2218 void Base64Escape(StringPiece src, string* dest) {
2219 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2220 src.size(), dest, true);
2221 }
2222
WebSafeBase64Escape(StringPiece src,string * dest)2223 void WebSafeBase64Escape(StringPiece src, string* dest) {
2224 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2225 src.size(), dest, false);
2226 }
2227
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2228 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2229 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2230 src.size(), dest, true);
2231 }
2232
2233 // Helper to append a Unicode code point to a string as UTF8, without bringing
2234 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2235 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2236 uint32 tmp = 0;
2237 int len = 0;
2238 if (code_point <= 0x7f) {
2239 tmp = code_point;
2240 len = 1;
2241 } else if (code_point <= 0x07ff) {
2242 tmp = 0x0000c080 |
2243 ((code_point & 0x07c0) << 2) |
2244 (code_point & 0x003f);
2245 len = 2;
2246 } else if (code_point <= 0xffff) {
2247 tmp = 0x00e08080 |
2248 ((code_point & 0xf000) << 4) |
2249 ((code_point & 0x0fc0) << 2) |
2250 (code_point & 0x003f);
2251 len = 3;
2252 } else {
2253 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2254 // normally only defined up to there as well.
2255 tmp = 0xf0808080 |
2256 ((code_point & 0x1c0000) << 6) |
2257 ((code_point & 0x03f000) << 4) |
2258 ((code_point & 0x000fc0) << 2) |
2259 (code_point & 0x003f);
2260 len = 4;
2261 }
2262 tmp = ghtonl(tmp);
2263 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2264 return len;
2265 }
2266
2267 // Table of UTF-8 character lengths, based on first byte
2268 static const unsigned char kUTF8LenTbl[256] = {
2269 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2270 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2271 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2272 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2273
2274 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2275 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2276 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2277 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2278 };
2279
2280 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2281 int UTF8FirstLetterNumBytes(const char* src, int len) {
2282 if (len == 0) {
2283 return 0;
2284 }
2285 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2286 }
2287
2288 } // namespace protobuf
2289 } // namespace google
2290