1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34
35 #include <errno.h>
36 #include <float.h> // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions. The former is buggy. When invoked like so:
51 // char buffer[32];
52 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10". This is plainly wrong: %g should never print
54 // trailing zeros after the decimal point. For some reason this bug only
55 // occurs with some input values, not all. In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59
60 namespace google {
61 namespace protobuf {
62
63 // These are defined as macros on some platforms. #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67
68 // The definitions of these in ctype.h change based on locale. Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale. So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73 return ('0' <= c && c <= '9') ||
74 ('a' <= c && c <= 'f') ||
75 ('A' <= c && c <= 'F');
76 }
77
isprint(char c)78 inline bool isprint(char c) {
79 return c >= 0x20 && c <= 0x7E;
80 }
81
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 // Replaces any occurrence of the character 'remove' (or the characters
85 // in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)87 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
88 const char *str_start = s->c_str();
89 const char *str = str_start;
90 for (str = strpbrk(str, remove);
91 str != nullptr;
92 str = strpbrk(str + 1, remove)) {
93 (*s)[str - str_start] = replacewith;
94 }
95 }
96
StripWhitespace(string * str)97 void StripWhitespace(string* str) {
98 int str_length = str->length();
99
100 // Strip off leading whitespace.
101 int first = 0;
102 while (first < str_length && ascii_isspace(str->at(first))) {
103 ++first;
104 }
105 // If entire string is white space.
106 if (first == str_length) {
107 str->clear();
108 return;
109 }
110 if (first > 0) {
111 str->erase(0, first);
112 str_length -= first;
113 }
114
115 // Strip off trailing whitespace.
116 int last = str_length - 1;
117 while (last >= 0 && ascii_isspace(str->at(last))) {
118 --last;
119 }
120 if (last != (str_length - 1) && last >= 0) {
121 str->erase(last + 1, string::npos);
122 }
123 }
124
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 // Replace the "old" pattern with the "new" pattern in a string,
128 // and append the result to "res". If replace_all is false,
129 // it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)132 void StringReplace(const string& s, const string& oldsub,
133 const string& newsub, bool replace_all,
134 string* res) {
135 if (oldsub.empty()) {
136 res->append(s); // if empty, append the given string.
137 return;
138 }
139
140 string::size_type start_pos = 0;
141 string::size_type pos;
142 do {
143 pos = s.find(oldsub, start_pos);
144 if (pos == string::npos) {
145 break;
146 }
147 res->append(s, start_pos, pos - start_pos);
148 res->append(newsub);
149 start_pos = pos + oldsub.size(); // start searching again after the "old"
150 } while (replace_all);
151 res->append(s, start_pos, s.length() - start_pos);
152 }
153
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 // Give me a string and two patterns "old" and "new", and I replace
157 // the first instance of "old" in the string with "new", if it
158 // exists. If "global" is true; call this repeatedly until it
159 // fails. RETURN a new string, regardless of whether the replacement
160 // happened or not.
161 // ----------------------------------------------------------------------
162
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)163 string StringReplace(const string& s, const string& oldsub,
164 const string& newsub, bool replace_all) {
165 string ret;
166 StringReplace(s, oldsub, newsub, replace_all, &ret);
167 return ret;
168 }
169
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 // Split a string using a character delimiter. Append the components
173 // to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
SplitStringToIteratorUsing(StringPiece full,const char * delim,ITR & result)179 static inline void SplitStringToIteratorUsing(StringPiece full,
180 const char *delim, ITR &result) {
181 // Optimize the common case where delim is a single character.
182 if (delim[0] != '\0' && delim[1] == '\0') {
183 char c = delim[0];
184 const char* p = full.data();
185 const char* end = p + full.size();
186 while (p != end) {
187 if (*p == c) {
188 ++p;
189 } else {
190 const char* start = p;
191 while (++p != end && *p != c);
192 *result++ = std::string(start, p - start);
193 }
194 }
195 return;
196 }
197
198 string::size_type begin_index, end_index;
199 begin_index = full.find_first_not_of(delim);
200 while (begin_index != string::npos) {
201 end_index = full.find_first_of(delim, begin_index);
202 if (end_index == string::npos) {
203 *result++ = std::string(full.substr(begin_index));
204 return;
205 }
206 *result++ =
207 std::string(full.substr(begin_index, (end_index - begin_index)));
208 begin_index = full.find_first_not_of(delim, end_index);
209 }
210 }
211
SplitStringUsing(StringPiece full,const char * delim,std::vector<string> * result)212 void SplitStringUsing(StringPiece full, const char *delim,
213 std::vector<string> *result) {
214 std::back_insert_iterator< std::vector<string> > it(*result);
215 SplitStringToIteratorUsing(full, delim, it);
216 }
217
218 // Split a string using a character delimiter. Append the components
219 // to 'result'. If there are consecutive delimiters, this function
220 // will return corresponding empty strings. The string is split into
221 // at most the specified number of pieces greedily. This means that the
222 // last piece may possibly be split further. To split into as many pieces
223 // as possible, specify 0 as the number of pieces.
224 //
225 // If "full" is the empty string, yields an empty string as the only value.
226 //
227 // If "pieces" is negative for some reason, it returns the whole string
228 // ----------------------------------------------------------------------
229 template <typename ITR>
SplitStringToIteratorAllowEmpty(StringPiece full,const char * delim,int pieces,ITR & result)230 static inline void SplitStringToIteratorAllowEmpty(StringPiece full,
231 const char *delim,
232 int pieces, ITR &result) {
233 string::size_type begin_index, end_index;
234 begin_index = 0;
235
236 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
237 end_index = full.find_first_of(delim, begin_index);
238 if (end_index == string::npos) {
239 *result++ = std::string(full.substr(begin_index));
240 return;
241 }
242 *result++ =
243 std::string(full.substr(begin_index, (end_index - begin_index)));
244 begin_index = end_index + 1;
245 }
246 *result++ = std::string(full.substr(begin_index));
247 }
248
SplitStringAllowEmpty(StringPiece full,const char * delim,std::vector<string> * result)249 void SplitStringAllowEmpty(StringPiece full, const char *delim,
250 std::vector<string> *result) {
251 std::back_insert_iterator<std::vector<string> > it(*result);
252 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
253 }
254
255 // ----------------------------------------------------------------------
256 // JoinStrings()
257 // This merges a vector of string components with delim inserted
258 // as separaters between components.
259 //
260 // ----------------------------------------------------------------------
261 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)262 static void JoinStringsIterator(const ITERATOR& start,
263 const ITERATOR& end,
264 const char* delim,
265 string* result) {
266 GOOGLE_CHECK(result != nullptr);
267 result->clear();
268 int delim_length = strlen(delim);
269
270 // Precompute resulting length so we can reserve() memory in one shot.
271 int length = 0;
272 for (ITERATOR iter = start; iter != end; ++iter) {
273 if (iter != start) {
274 length += delim_length;
275 }
276 length += iter->size();
277 }
278 result->reserve(length);
279
280 // Now combine everything.
281 for (ITERATOR iter = start; iter != end; ++iter) {
282 if (iter != start) {
283 result->append(delim, delim_length);
284 }
285 result->append(iter->data(), iter->size());
286 }
287 }
288
JoinStrings(const std::vector<string> & components,const char * delim,string * result)289 void JoinStrings(const std::vector<string>& components,
290 const char* delim,
291 string * result) {
292 JoinStringsIterator(components.begin(), components.end(), delim, result);
293 }
294
295 // ----------------------------------------------------------------------
296 // UnescapeCEscapeSequences()
297 // This does all the unescaping that C does: \ooo, \r, \n, etc
298 // Returns length of resulting string.
299 // The implementation of \x parses any positive number of hex digits,
300 // but it is an error if the value requires more than 8 bits, and the
301 // result is truncated to 8 bits.
302 //
303 // The second call stores its errors in a supplied string vector.
304 // If the string vector pointer is nullptr, it reports the errors with LOG().
305 // ----------------------------------------------------------------------
306
307 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
308
309 // Protocol buffers doesn't ever care about errors, but I don't want to remove
310 // the code.
311 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
312
UnescapeCEscapeSequences(const char * source,char * dest)313 int UnescapeCEscapeSequences(const char* source, char* dest) {
314 return UnescapeCEscapeSequences(source, dest, nullptr);
315 }
316
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)317 int UnescapeCEscapeSequences(const char* source, char* dest,
318 std::vector<string> *errors) {
319 GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
320
321 char* d = dest;
322 const char* p = source;
323
324 // Small optimization for case where source = dest and there's no escaping
325 while ( p == d && *p != '\0' && *p != '\\' )
326 p++, d++;
327
328 while (*p != '\0') {
329 if (*p != '\\') {
330 *d++ = *p++;
331 } else {
332 switch ( *++p ) { // skip past the '\\'
333 case '\0':
334 LOG_STRING(ERROR, errors) << "String cannot end with \\";
335 *d = '\0';
336 return d - dest; // we're done with p
337 case 'a': *d++ = '\a'; break;
338 case 'b': *d++ = '\b'; break;
339 case 'f': *d++ = '\f'; break;
340 case 'n': *d++ = '\n'; break;
341 case 'r': *d++ = '\r'; break;
342 case 't': *d++ = '\t'; break;
343 case 'v': *d++ = '\v'; break;
344 case '\\': *d++ = '\\'; break;
345 case '?': *d++ = '\?'; break; // \? Who knew?
346 case '\'': *d++ = '\''; break;
347 case '"': *d++ = '\"'; break;
348 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
349 case '4': case '5': case '6': case '7': {
350 char ch = *p - '0';
351 if ( IS_OCTAL_DIGIT(p[1]) )
352 ch = ch * 8 + *++p - '0';
353 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
354 ch = ch * 8 + *++p - '0'; // now points at last digit
355 *d++ = ch;
356 break;
357 }
358 case 'x': case 'X': {
359 if (!isxdigit(p[1])) {
360 if (p[1] == '\0') {
361 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
362 } else {
363 LOG_STRING(ERROR, errors) <<
364 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
365 }
366 break;
367 }
368 unsigned int ch = 0;
369 const char *hex_start = p;
370 while (isxdigit(p[1])) // arbitrarily many hex digits
371 ch = (ch << 4) + hex_digit_to_int(*++p);
372 if (ch > 0xFF)
373 LOG_STRING(ERROR, errors) << "Value of " <<
374 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
375 *d++ = ch;
376 break;
377 }
378 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
379 case 'u': {
380 // \uhhhh => convert 4 hex digits to UTF-8
381 char32 rune = 0;
382 const char *hex_start = p;
383 for (int i = 0; i < 4; ++i) {
384 if (isxdigit(p[1])) { // Look one char ahead.
385 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
386 } else {
387 LOG_STRING(ERROR, errors)
388 << "\\u must be followed by 4 hex digits: \\"
389 << string(hex_start, p+1-hex_start);
390 break;
391 }
392 }
393 d += runetochar(d, &rune);
394 break;
395 }
396 case 'U': {
397 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
398 char32 rune = 0;
399 const char *hex_start = p;
400 for (int i = 0; i < 8; ++i) {
401 if (isxdigit(p[1])) { // Look one char ahead.
402 // Don't change rune until we're sure this
403 // is within the Unicode limit, but do advance p.
404 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
405 if (newrune > 0x10FFFF) {
406 LOG_STRING(ERROR, errors)
407 << "Value of \\"
408 << string(hex_start, p + 1 - hex_start)
409 << " exceeds Unicode limit (0x10FFFF)";
410 break;
411 } else {
412 rune = newrune;
413 }
414 } else {
415 LOG_STRING(ERROR, errors)
416 << "\\U must be followed by 8 hex digits: \\"
417 << string(hex_start, p+1-hex_start);
418 break;
419 }
420 }
421 d += runetochar(d, &rune);
422 break;
423 }
424 #endif
425 default:
426 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
427 }
428 p++; // read past letter we escaped
429 }
430 }
431 *d = '\0';
432 return d - dest;
433 }
434
435 // ----------------------------------------------------------------------
436 // UnescapeCEscapeString()
437 // This does the same thing as UnescapeCEscapeSequences, but creates
438 // a new string. The caller does not need to worry about allocating
439 // a dest buffer. This should be used for non performance critical
440 // tasks such as printing debug messages. It is safe for src and dest
441 // to be the same.
442 //
443 // The second call stores its errors in a supplied string vector.
444 // If the string vector pointer is nullptr, it reports the errors with LOG().
445 //
446 // In the first and second calls, the length of dest is returned. In the
447 // the third call, the new string is returned.
448 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)449 int UnescapeCEscapeString(const string& src, string* dest) {
450 return UnescapeCEscapeString(src, dest, nullptr);
451 }
452
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)453 int UnescapeCEscapeString(const string& src, string* dest,
454 std::vector<string> *errors) {
455 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
456 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
457 GOOGLE_CHECK(dest);
458 dest->assign(unescaped.get(), len);
459 return len;
460 }
461
UnescapeCEscapeString(const string & src)462 string UnescapeCEscapeString(const string& src) {
463 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
464 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
465 return string(unescaped.get(), len);
466 }
467
468 // ----------------------------------------------------------------------
469 // CEscapeString()
470 // CHexEscapeString()
471 // Copies 'src' to 'dest', escaping dangerous characters using
472 // C-style escape sequences. This is very useful for preparing query
473 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
474 // hexadecimal rather than octal sequences.
475 // Returns the number of bytes written to 'dest' (not including the \0)
476 // or -1 if there was insufficient space.
477 //
478 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
479 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)480 int CEscapeInternal(const char* src, int src_len, char* dest,
481 int dest_len, bool use_hex, bool utf8_safe) {
482 const char* src_end = src + src_len;
483 int used = 0;
484 bool last_hex_escape = false; // true if last output char was \xNN
485
486 for (; src < src_end; src++) {
487 if (dest_len - used < 2) // Need space for two letter escape
488 return -1;
489
490 bool is_hex_escape = false;
491 switch (*src) {
492 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
493 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
494 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
495 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
496 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
497 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
498 default:
499 // Note that if we emit \xNN and the src character after that is a hex
500 // digit then that digit must be escaped too to prevent it being
501 // interpreted as part of the character code by C.
502 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
503 (!isprint(*src) ||
504 (last_hex_escape && isxdigit(*src)))) {
505 if (dest_len - used < 4) // need space for 4 letter escape
506 return -1;
507 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
508 static_cast<uint8>(*src));
509 is_hex_escape = use_hex;
510 used += 4;
511 } else {
512 dest[used++] = *src; break;
513 }
514 }
515 last_hex_escape = is_hex_escape;
516 }
517
518 if (dest_len - used < 1) // make sure that there is room for \0
519 return -1;
520
521 dest[used] = '\0'; // doesn't count towards return value though
522 return used;
523 }
524
525 // Calculates the length of the C-style escaped version of 'src'.
526 // Assumes that non-printable characters are escaped using octal sequences, and
527 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)528 static inline size_t CEscapedLength(StringPiece src) {
529 static char c_escaped_len[256] = {
530 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
531 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
532 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
533 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
534 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
537 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
538 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
539 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
540 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546 };
547
548 size_t escaped_len = 0;
549 for (int i = 0; i < src.size(); ++i) {
550 unsigned char c = static_cast<unsigned char>(src[i]);
551 escaped_len += c_escaped_len[c];
552 }
553 return escaped_len;
554 }
555
556 // ----------------------------------------------------------------------
557 // Escapes 'src' using C-style escape sequences, and appends the escaped string
558 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
559 // the required space using a lookup table, and also does not do any special
560 // handling for Hex or UTF-8 characters.
561 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)562 void CEscapeAndAppend(StringPiece src, string* dest) {
563 size_t escaped_len = CEscapedLength(src);
564 if (escaped_len == src.size()) {
565 dest->append(src.data(), src.size());
566 return;
567 }
568
569 size_t cur_dest_len = dest->size();
570 dest->resize(cur_dest_len + escaped_len);
571 char* append_ptr = &(*dest)[cur_dest_len];
572
573 for (int i = 0; i < src.size(); ++i) {
574 unsigned char c = static_cast<unsigned char>(src[i]);
575 switch (c) {
576 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
577 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
578 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
579 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
580 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
581 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
582 default:
583 if (!isprint(c)) {
584 *append_ptr++ = '\\';
585 *append_ptr++ = '0' + c / 64;
586 *append_ptr++ = '0' + (c % 64) / 8;
587 *append_ptr++ = '0' + c % 8;
588 } else {
589 *append_ptr++ = c;
590 }
591 break;
592 }
593 }
594 }
595
CEscape(const string & src)596 string CEscape(const string& src) {
597 string dest;
598 CEscapeAndAppend(src, &dest);
599 return dest;
600 }
601
602 namespace strings {
603
Utf8SafeCEscape(const string & src)604 string Utf8SafeCEscape(const string& src) {
605 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
606 std::unique_ptr<char[]> dest(new char[dest_length]);
607 const int len = CEscapeInternal(src.data(), src.size(),
608 dest.get(), dest_length, false, true);
609 GOOGLE_DCHECK_GE(len, 0);
610 return string(dest.get(), len);
611 }
612
CHexEscape(const string & src)613 string CHexEscape(const string& src) {
614 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
615 std::unique_ptr<char[]> dest(new char[dest_length]);
616 const int len = CEscapeInternal(src.data(), src.size(),
617 dest.get(), dest_length, true, false);
618 GOOGLE_DCHECK_GE(len, 0);
619 return string(dest.get(), len);
620 }
621
622 } // namespace strings
623
624 // ----------------------------------------------------------------------
625 // strto32_adaptor()
626 // strtou32_adaptor()
627 // Implementation of strto[u]l replacements that have identical
628 // overflow and underflow characteristics for both ILP-32 and LP-64
629 // platforms, including errno preservation in error-free calls.
630 // ----------------------------------------------------------------------
631
strto32_adaptor(const char * nptr,char ** endptr,int base)632 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
633 const int saved_errno = errno;
634 errno = 0;
635 const long result = strtol(nptr, endptr, base);
636 if (errno == ERANGE && result == LONG_MIN) {
637 return kint32min;
638 } else if (errno == ERANGE && result == LONG_MAX) {
639 return kint32max;
640 } else if (errno == 0 && result < kint32min) {
641 errno = ERANGE;
642 return kint32min;
643 } else if (errno == 0 && result > kint32max) {
644 errno = ERANGE;
645 return kint32max;
646 }
647 if (errno == 0)
648 errno = saved_errno;
649 return static_cast<int32>(result);
650 }
651
strtou32_adaptor(const char * nptr,char ** endptr,int base)652 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
653 const int saved_errno = errno;
654 errno = 0;
655 const unsigned long result = strtoul(nptr, endptr, base);
656 if (errno == ERANGE && result == ULONG_MAX) {
657 return kuint32max;
658 } else if (errno == 0 && result > kuint32max) {
659 errno = ERANGE;
660 return kuint32max;
661 }
662 if (errno == 0)
663 errno = saved_errno;
664 return static_cast<uint32>(result);
665 }
666
safe_parse_sign(string * text,bool * negative_ptr)667 inline bool safe_parse_sign(string* text /*inout*/,
668 bool* negative_ptr /*output*/) {
669 const char* start = text->data();
670 const char* end = start + text->size();
671
672 // Consume whitespace.
673 while (start < end && (start[0] == ' ')) {
674 ++start;
675 }
676 while (start < end && (end[-1] == ' ')) {
677 --end;
678 }
679 if (start >= end) {
680 return false;
681 }
682
683 // Consume sign.
684 *negative_ptr = (start[0] == '-');
685 if (*negative_ptr || start[0] == '+') {
686 ++start;
687 if (start >= end) {
688 return false;
689 }
690 }
691 *text = text->substr(start - text->data(), end - start);
692 return true;
693 }
694
695 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)696 bool safe_parse_positive_int(
697 string text, IntType* value_p) {
698 int base = 10;
699 IntType value = 0;
700 const IntType vmax = std::numeric_limits<IntType>::max();
701 assert(vmax > 0);
702 assert(vmax >= base);
703 const IntType vmax_over_base = vmax / base;
704 const char* start = text.data();
705 const char* end = start + text.size();
706 // loop over digits
707 for (; start < end; ++start) {
708 unsigned char c = static_cast<unsigned char>(start[0]);
709 int digit = c - '0';
710 if (digit >= base || digit < 0) {
711 *value_p = value;
712 return false;
713 }
714 if (value > vmax_over_base) {
715 *value_p = vmax;
716 return false;
717 }
718 value *= base;
719 if (value > vmax - digit) {
720 *value_p = vmax;
721 return false;
722 }
723 value += digit;
724 }
725 *value_p = value;
726 return true;
727 }
728
729 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)730 bool safe_parse_negative_int(
731 const string& text, IntType* value_p) {
732 int base = 10;
733 IntType value = 0;
734 const IntType vmin = std::numeric_limits<IntType>::min();
735 assert(vmin < 0);
736 assert(vmin <= 0 - base);
737 IntType vmin_over_base = vmin / base;
738 // 2003 c++ standard [expr.mul]
739 // "... the sign of the remainder is implementation-defined."
740 // Although (vmin/base)*base + vmin%base is always vmin.
741 // 2011 c++ standard tightens the spec but we cannot rely on it.
742 if (vmin % base > 0) {
743 vmin_over_base += 1;
744 }
745 const char* start = text.data();
746 const char* end = start + text.size();
747 // loop over digits
748 for (; start < end; ++start) {
749 unsigned char c = static_cast<unsigned char>(start[0]);
750 int digit = c - '0';
751 if (digit >= base || digit < 0) {
752 *value_p = value;
753 return false;
754 }
755 if (value < vmin_over_base) {
756 *value_p = vmin;
757 return false;
758 }
759 value *= base;
760 if (value < vmin + digit) {
761 *value_p = vmin;
762 return false;
763 }
764 value -= digit;
765 }
766 *value_p = value;
767 return true;
768 }
769
770 template<typename IntType>
safe_int_internal(string text,IntType * value_p)771 bool safe_int_internal(string text, IntType* value_p) {
772 *value_p = 0;
773 bool negative;
774 if (!safe_parse_sign(&text, &negative)) {
775 return false;
776 }
777 if (!negative) {
778 return safe_parse_positive_int(text, value_p);
779 } else {
780 return safe_parse_negative_int(text, value_p);
781 }
782 }
783
784 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)785 bool safe_uint_internal(string text, IntType* value_p) {
786 *value_p = 0;
787 bool negative;
788 if (!safe_parse_sign(&text, &negative) || negative) {
789 return false;
790 }
791 return safe_parse_positive_int(text, value_p);
792 }
793
794 // ----------------------------------------------------------------------
795 // FastIntToBuffer()
796 // FastInt64ToBuffer()
797 // FastHexToBuffer()
798 // FastHex64ToBuffer()
799 // FastHex32ToBuffer()
800 // ----------------------------------------------------------------------
801
802 // Offset into buffer where FastInt64ToBuffer places the end of string
803 // null character. Also used by FastInt64ToBufferLeft.
804 static const int kFastInt64ToBufferOffset = 21;
805
FastInt64ToBuffer(int64 i,char * buffer)806 char *FastInt64ToBuffer(int64 i, char* buffer) {
807 // We could collapse the positive and negative sections, but that
808 // would be slightly slower for positive numbers...
809 // 22 bytes is enough to store -2**64, -18446744073709551616.
810 char* p = buffer + kFastInt64ToBufferOffset;
811 *p-- = '\0';
812 if (i >= 0) {
813 do {
814 *p-- = '0' + i % 10;
815 i /= 10;
816 } while (i > 0);
817 return p + 1;
818 } else {
819 // On different platforms, % and / have different behaviors for
820 // negative numbers, so we need to jump through hoops to make sure
821 // we don't divide negative numbers.
822 if (i > -10) {
823 i = -i;
824 *p-- = '0' + i;
825 *p = '-';
826 return p;
827 } else {
828 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
829 i = i + 10;
830 i = -i;
831 *p-- = '0' + i % 10;
832 // Undo what we did a moment ago
833 i = i / 10 + 1;
834 do {
835 *p-- = '0' + i % 10;
836 i /= 10;
837 } while (i > 0);
838 *p = '-';
839 return p;
840 }
841 }
842 }
843
844 // Offset into buffer where FastInt32ToBuffer places the end of string
845 // null character. Also used by FastInt32ToBufferLeft
846 static const int kFastInt32ToBufferOffset = 11;
847
848 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
849 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
850 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)851 char *FastInt32ToBuffer(int32 i, char* buffer) {
852 // We could collapse the positive and negative sections, but that
853 // would be slightly slower for positive numbers...
854 // 12 bytes is enough to store -2**32, -4294967296.
855 char* p = buffer + kFastInt32ToBufferOffset;
856 *p-- = '\0';
857 if (i >= 0) {
858 do {
859 *p-- = '0' + i % 10;
860 i /= 10;
861 } while (i > 0);
862 return p + 1;
863 } else {
864 // On different platforms, % and / have different behaviors for
865 // negative numbers, so we need to jump through hoops to make sure
866 // we don't divide negative numbers.
867 if (i > -10) {
868 i = -i;
869 *p-- = '0' + i;
870 *p = '-';
871 return p;
872 } else {
873 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
874 i = i + 10;
875 i = -i;
876 *p-- = '0' + i % 10;
877 // Undo what we did a moment ago
878 i = i / 10 + 1;
879 do {
880 *p-- = '0' + i % 10;
881 i /= 10;
882 } while (i > 0);
883 *p = '-';
884 return p;
885 }
886 }
887 }
888
FastHexToBuffer(int i,char * buffer)889 char *FastHexToBuffer(int i, char* buffer) {
890 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
891
892 static const char *hexdigits = "0123456789abcdef";
893 char *p = buffer + 21;
894 *p-- = '\0';
895 do {
896 *p-- = hexdigits[i & 15]; // mod by 16
897 i >>= 4; // divide by 16
898 } while (i > 0);
899 return p + 1;
900 }
901
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)902 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
903 static const char *hexdigits = "0123456789abcdef";
904 buffer[num_byte] = '\0';
905 for (int i = num_byte - 1; i >= 0; i--) {
906 #ifdef _M_X64
907 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
908 // block. Given that the uint32 cast was to improve performance on 32-bit
909 // platforms, we use 64-bit '&' directly.
910 buffer[i] = hexdigits[value & 0xf];
911 #else
912 buffer[i] = hexdigits[uint32(value) & 0xf];
913 #endif
914 value >>= 4;
915 }
916 return buffer;
917 }
918
FastHex64ToBuffer(uint64 value,char * buffer)919 char *FastHex64ToBuffer(uint64 value, char* buffer) {
920 return InternalFastHexToBuffer(value, buffer, 16);
921 }
922
FastHex32ToBuffer(uint32 value,char * buffer)923 char *FastHex32ToBuffer(uint32 value, char* buffer) {
924 return InternalFastHexToBuffer(value, buffer, 8);
925 }
926
927 // ----------------------------------------------------------------------
928 // FastInt32ToBufferLeft()
929 // FastUInt32ToBufferLeft()
930 // FastInt64ToBufferLeft()
931 // FastUInt64ToBufferLeft()
932 //
933 // Like the Fast*ToBuffer() functions above, these are intended for speed.
934 // Unlike the Fast*ToBuffer() functions, however, these functions write
935 // their output to the beginning of the buffer (hence the name, as the
936 // output is left-aligned). The caller is responsible for ensuring that
937 // the buffer has enough space to hold the output.
938 //
939 // Returns a pointer to the end of the string (i.e. the null character
940 // terminating the string).
941 // ----------------------------------------------------------------------
942
943 static const char two_ASCII_digits[100][2] = {
944 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
945 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
946 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
947 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
948 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
949 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
950 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
951 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
952 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
953 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
954 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
955 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
956 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
957 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
958 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
959 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
960 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
961 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
962 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
963 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
964 };
965
FastUInt32ToBufferLeft(uint32 u,char * buffer)966 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
967 uint32 digits;
968 const char *ASCII_digits = nullptr;
969 // The idea of this implementation is to trim the number of divides to as few
970 // as possible by using multiplication and subtraction rather than mod (%),
971 // and by outputting two digits at a time rather than one.
972 // The huge-number case is first, in the hopes that the compiler will output
973 // that case in one branch-free block of code, and only output conditional
974 // branches into it from below.
975 if (u >= 1000000000) { // >= 1,000,000,000
976 digits = u / 100000000; // 100,000,000
977 ASCII_digits = two_ASCII_digits[digits];
978 buffer[0] = ASCII_digits[0];
979 buffer[1] = ASCII_digits[1];
980 buffer += 2;
981 sublt100_000_000:
982 u -= digits * 100000000; // 100,000,000
983 lt100_000_000:
984 digits = u / 1000000; // 1,000,000
985 ASCII_digits = two_ASCII_digits[digits];
986 buffer[0] = ASCII_digits[0];
987 buffer[1] = ASCII_digits[1];
988 buffer += 2;
989 sublt1_000_000:
990 u -= digits * 1000000; // 1,000,000
991 lt1_000_000:
992 digits = u / 10000; // 10,000
993 ASCII_digits = two_ASCII_digits[digits];
994 buffer[0] = ASCII_digits[0];
995 buffer[1] = ASCII_digits[1];
996 buffer += 2;
997 sublt10_000:
998 u -= digits * 10000; // 10,000
999 lt10_000:
1000 digits = u / 100;
1001 ASCII_digits = two_ASCII_digits[digits];
1002 buffer[0] = ASCII_digits[0];
1003 buffer[1] = ASCII_digits[1];
1004 buffer += 2;
1005 sublt100:
1006 u -= digits * 100;
1007 lt100:
1008 digits = u;
1009 ASCII_digits = two_ASCII_digits[digits];
1010 buffer[0] = ASCII_digits[0];
1011 buffer[1] = ASCII_digits[1];
1012 buffer += 2;
1013 done:
1014 *buffer = 0;
1015 return buffer;
1016 }
1017
1018 if (u < 100) {
1019 digits = u;
1020 if (u >= 10) goto lt100;
1021 *buffer++ = '0' + digits;
1022 goto done;
1023 }
1024 if (u < 10000) { // 10,000
1025 if (u >= 1000) goto lt10_000;
1026 digits = u / 100;
1027 *buffer++ = '0' + digits;
1028 goto sublt100;
1029 }
1030 if (u < 1000000) { // 1,000,000
1031 if (u >= 100000) goto lt1_000_000;
1032 digits = u / 10000; // 10,000
1033 *buffer++ = '0' + digits;
1034 goto sublt10_000;
1035 }
1036 if (u < 100000000) { // 100,000,000
1037 if (u >= 10000000) goto lt100_000_000;
1038 digits = u / 1000000; // 1,000,000
1039 *buffer++ = '0' + digits;
1040 goto sublt1_000_000;
1041 }
1042 // we already know that u < 1,000,000,000
1043 digits = u / 100000000; // 100,000,000
1044 *buffer++ = '0' + digits;
1045 goto sublt100_000_000;
1046 }
1047
FastInt32ToBufferLeft(int32 i,char * buffer)1048 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1049 uint32 u = 0;
1050 if (i < 0) {
1051 *buffer++ = '-';
1052 u -= i;
1053 } else {
1054 u = i;
1055 }
1056 return FastUInt32ToBufferLeft(u, buffer);
1057 }
1058
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1059 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1060 int digits;
1061 const char *ASCII_digits = nullptr;
1062
1063 uint32 u = static_cast<uint32>(u64);
1064 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1065
1066 uint64 top_11_digits = u64 / 1000000000;
1067 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1068 u = u64 - (top_11_digits * 1000000000);
1069
1070 digits = u / 10000000; // 10,000,000
1071 GOOGLE_DCHECK_LT(digits, 100);
1072 ASCII_digits = two_ASCII_digits[digits];
1073 buffer[0] = ASCII_digits[0];
1074 buffer[1] = ASCII_digits[1];
1075 buffer += 2;
1076 u -= digits * 10000000; // 10,000,000
1077 digits = u / 100000; // 100,000
1078 ASCII_digits = two_ASCII_digits[digits];
1079 buffer[0] = ASCII_digits[0];
1080 buffer[1] = ASCII_digits[1];
1081 buffer += 2;
1082 u -= digits * 100000; // 100,000
1083 digits = u / 1000; // 1,000
1084 ASCII_digits = two_ASCII_digits[digits];
1085 buffer[0] = ASCII_digits[0];
1086 buffer[1] = ASCII_digits[1];
1087 buffer += 2;
1088 u -= digits * 1000; // 1,000
1089 digits = u / 10;
1090 ASCII_digits = two_ASCII_digits[digits];
1091 buffer[0] = ASCII_digits[0];
1092 buffer[1] = ASCII_digits[1];
1093 buffer += 2;
1094 u -= digits * 10;
1095 digits = u;
1096 *buffer++ = '0' + digits;
1097 *buffer = 0;
1098 return buffer;
1099 }
1100
FastInt64ToBufferLeft(int64 i,char * buffer)1101 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1102 uint64 u = 0;
1103 if (i < 0) {
1104 *buffer++ = '-';
1105 u -= i;
1106 } else {
1107 u = i;
1108 }
1109 return FastUInt64ToBufferLeft(u, buffer);
1110 }
1111
1112 // ----------------------------------------------------------------------
1113 // SimpleItoa()
1114 // Description: converts an integer to a string.
1115 //
1116 // Return value: string
1117 // ----------------------------------------------------------------------
1118
SimpleItoa(int i)1119 string SimpleItoa(int i) {
1120 char buffer[kFastToBufferSize];
1121 return (sizeof(i) == 4) ?
1122 FastInt32ToBuffer(i, buffer) :
1123 FastInt64ToBuffer(i, buffer);
1124 }
1125
SimpleItoa(unsigned int i)1126 string SimpleItoa(unsigned int i) {
1127 char buffer[kFastToBufferSize];
1128 return string(buffer, (sizeof(i) == 4) ?
1129 FastUInt32ToBufferLeft(i, buffer) :
1130 FastUInt64ToBufferLeft(i, buffer));
1131 }
1132
SimpleItoa(long i)1133 string SimpleItoa(long i) {
1134 char buffer[kFastToBufferSize];
1135 return (sizeof(i) == 4) ?
1136 FastInt32ToBuffer(i, buffer) :
1137 FastInt64ToBuffer(i, buffer);
1138 }
1139
SimpleItoa(unsigned long i)1140 string SimpleItoa(unsigned long i) {
1141 char buffer[kFastToBufferSize];
1142 return string(buffer, (sizeof(i) == 4) ?
1143 FastUInt32ToBufferLeft(i, buffer) :
1144 FastUInt64ToBufferLeft(i, buffer));
1145 }
1146
SimpleItoa(long long i)1147 string SimpleItoa(long long i) {
1148 char buffer[kFastToBufferSize];
1149 return (sizeof(i) == 4) ?
1150 FastInt32ToBuffer(i, buffer) :
1151 FastInt64ToBuffer(i, buffer);
1152 }
1153
SimpleItoa(unsigned long long i)1154 string SimpleItoa(unsigned long long i) {
1155 char buffer[kFastToBufferSize];
1156 return string(buffer, (sizeof(i) == 4) ?
1157 FastUInt32ToBufferLeft(i, buffer) :
1158 FastUInt64ToBufferLeft(i, buffer));
1159 }
1160
1161 // ----------------------------------------------------------------------
1162 // SimpleDtoa()
1163 // SimpleFtoa()
1164 // DoubleToBuffer()
1165 // FloatToBuffer()
1166 // We want to print the value without losing precision, but we also do
1167 // not want to print more digits than necessary. This turns out to be
1168 // trickier than it sounds. Numbers like 0.2 cannot be represented
1169 // exactly in binary. If we print 0.2 with a very large precision,
1170 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1171 // On the other hand, if we set the precision too low, we lose
1172 // significant digits when printing numbers that actually need them.
1173 // It turns out there is no precision value that does the right thing
1174 // for all numbers.
1175 //
1176 // Our strategy is to first try printing with a precision that is never
1177 // over-precise, then parse the result with strtod() to see if it
1178 // matches. If not, we print again with a precision that will always
1179 // give a precise result, but may use more digits than necessary.
1180 //
1181 // An arguably better strategy would be to use the algorithm described
1182 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1183 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1184 // however, that the following implementation is about as fast as
1185 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1186 // will not scale well on multi-core machines. DMG's code is slightly
1187 // more accurate (in that it will never use more digits than
1188 // necessary), but this is probably irrelevant for most users.
1189 //
1190 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1191 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1192 // one in that it makes guesses and then uses strtod() to check them.
1193 // Their implementation is faster because they use their own code to
1194 // generate the digits in the first place rather than use snprintf(),
1195 // thus avoiding format string parsing overhead. However, this makes
1196 // it considerably more complicated than the following implementation,
1197 // and it is embedded in a larger library. If speed turns out to be
1198 // an issue, we could re-implement this in terms of their
1199 // implementation.
1200 // ----------------------------------------------------------------------
1201
SimpleDtoa(double value)1202 string SimpleDtoa(double value) {
1203 char buffer[kDoubleToBufferSize];
1204 return DoubleToBuffer(value, buffer);
1205 }
1206
SimpleFtoa(float value)1207 string SimpleFtoa(float value) {
1208 char buffer[kFloatToBufferSize];
1209 return FloatToBuffer(value, buffer);
1210 }
1211
IsValidFloatChar(char c)1212 static inline bool IsValidFloatChar(char c) {
1213 return ('0' <= c && c <= '9') ||
1214 c == 'e' || c == 'E' ||
1215 c == '+' || c == '-';
1216 }
1217
DelocalizeRadix(char * buffer)1218 void DelocalizeRadix(char* buffer) {
1219 // Fast check: if the buffer has a normal decimal point, assume no
1220 // translation is needed.
1221 if (strchr(buffer, '.') != nullptr) return;
1222
1223 // Find the first unknown character.
1224 while (IsValidFloatChar(*buffer)) ++buffer;
1225
1226 if (*buffer == '\0') {
1227 // No radix character found.
1228 return;
1229 }
1230
1231 // We are now pointing at the locale-specific radix character. Replace it
1232 // with '.'.
1233 *buffer = '.';
1234 ++buffer;
1235
1236 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1237 // It appears the radix was a multi-byte character. We need to remove the
1238 // extra bytes.
1239 char* target = buffer;
1240 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1241 memmove(target, buffer, strlen(buffer) + 1);
1242 }
1243 }
1244
DoubleToBuffer(double value,char * buffer)1245 char* DoubleToBuffer(double value, char* buffer) {
1246 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1247 // platforms these days. Just in case some system exists where DBL_DIG
1248 // is significantly larger -- and risks overflowing our buffer -- we have
1249 // this assert.
1250 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1251
1252 if (value == std::numeric_limits<double>::infinity()) {
1253 strcpy(buffer, "inf");
1254 return buffer;
1255 } else if (value == -std::numeric_limits<double>::infinity()) {
1256 strcpy(buffer, "-inf");
1257 return buffer;
1258 } else if (std::isnan(value)) {
1259 strcpy(buffer, "nan");
1260 return buffer;
1261 }
1262
1263 int snprintf_result =
1264 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1265
1266 // The snprintf should never overflow because the buffer is significantly
1267 // larger than the precision we asked for.
1268 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1269
1270 // We need to make parsed_value volatile in order to force the compiler to
1271 // write it out to the stack. Otherwise, it may keep the value in a
1272 // register, and if it does that, it may keep it as a long double instead
1273 // of a double. This long double may have extra bits that make it compare
1274 // unequal to "value" even though it would be exactly equal if it were
1275 // truncated to a double.
1276 volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1277 if (parsed_value != value) {
1278 int snprintf_result =
1279 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1280
1281 // Should never overflow; see above.
1282 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1283 }
1284
1285 DelocalizeRadix(buffer);
1286 return buffer;
1287 }
1288
memcasecmp(const char * s1,const char * s2,size_t len)1289 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1290 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1291 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1292
1293 for ( int i = 0; i < len; i++ ) {
1294 const int diff =
1295 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1296 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1297 if (diff != 0) return diff;
1298 }
1299 return 0;
1300 }
1301
CaseEqual(StringPiece s1,StringPiece s2)1302 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1303 if (s1.size() != s2.size()) return false;
1304 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1305 }
1306
safe_strtob(StringPiece str,bool * value)1307 bool safe_strtob(StringPiece str, bool* value) {
1308 GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1309 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1310 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1311 CaseEqual(str, "1")) {
1312 *value = true;
1313 return true;
1314 }
1315 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1316 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1317 CaseEqual(str, "0")) {
1318 *value = false;
1319 return true;
1320 }
1321 return false;
1322 }
1323
safe_strtof(const char * str,float * value)1324 bool safe_strtof(const char* str, float* value) {
1325 char* endptr;
1326 errno = 0; // errno only gets set on errors
1327 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1328 *value = internal::NoLocaleStrtod(str, &endptr);
1329 #else
1330 *value = strtof(str, &endptr);
1331 #endif
1332 return *str != 0 && *endptr == 0 && errno == 0;
1333 }
1334
safe_strtod(const char * str,double * value)1335 bool safe_strtod(const char* str, double* value) {
1336 char* endptr;
1337 *value = internal::NoLocaleStrtod(str, &endptr);
1338 if (endptr != str) {
1339 while (ascii_isspace(*endptr)) ++endptr;
1340 }
1341 // Ignore range errors from strtod. The values it
1342 // returns on underflow and overflow are the right
1343 // fallback in a robust setting.
1344 return *str != '\0' && *endptr == '\0';
1345 }
1346
safe_strto32(const string & str,int32 * value)1347 bool safe_strto32(const string& str, int32* value) {
1348 return safe_int_internal(str, value);
1349 }
1350
safe_strtou32(const string & str,uint32 * value)1351 bool safe_strtou32(const string& str, uint32* value) {
1352 return safe_uint_internal(str, value);
1353 }
1354
safe_strto64(const string & str,int64 * value)1355 bool safe_strto64(const string& str, int64* value) {
1356 return safe_int_internal(str, value);
1357 }
1358
safe_strtou64(const string & str,uint64 * value)1359 bool safe_strtou64(const string& str, uint64* value) {
1360 return safe_uint_internal(str, value);
1361 }
1362
FloatToBuffer(float value,char * buffer)1363 char* FloatToBuffer(float value, char* buffer) {
1364 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1365 // platforms these days. Just in case some system exists where FLT_DIG
1366 // is significantly larger -- and risks overflowing our buffer -- we have
1367 // this assert.
1368 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1369
1370 if (value == std::numeric_limits<double>::infinity()) {
1371 strcpy(buffer, "inf");
1372 return buffer;
1373 } else if (value == -std::numeric_limits<double>::infinity()) {
1374 strcpy(buffer, "-inf");
1375 return buffer;
1376 } else if (std::isnan(value)) {
1377 strcpy(buffer, "nan");
1378 return buffer;
1379 }
1380
1381 int snprintf_result =
1382 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1383
1384 // The snprintf should never overflow because the buffer is significantly
1385 // larger than the precision we asked for.
1386 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1387
1388 float parsed_value;
1389 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1390 int snprintf_result =
1391 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1392
1393 // Should never overflow; see above.
1394 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1395 }
1396
1397 DelocalizeRadix(buffer);
1398 return buffer;
1399 }
1400
1401 namespace strings {
1402
AlphaNum(strings::Hex hex)1403 AlphaNum::AlphaNum(strings::Hex hex) {
1404 char *const end = &digits[kFastToBufferSize];
1405 char *writer = end;
1406 uint64 value = hex.value;
1407 uint64 width = hex.spec;
1408 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1409 // where 0x10000 is the smallest hex number that is as wide as the user
1410 // asked for.
1411 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1412 static const char hexdigits[] = "0123456789abcdef";
1413 do {
1414 *--writer = hexdigits[value & 0xF];
1415 value >>= 4;
1416 mask >>= 4;
1417 } while (mask != 0);
1418 piece_data_ = writer;
1419 piece_size_ = end - writer;
1420 }
1421
1422 } // namespace strings
1423
1424 // ----------------------------------------------------------------------
1425 // StrCat()
1426 // This merges the given strings or integers, with no delimiter. This
1427 // is designed to be the fastest possible way to construct a string out
1428 // of a mix of raw C strings, C++ strings, and integer values.
1429 // ----------------------------------------------------------------------
1430
1431 // Append is merely a version of memcpy that returns the address of the byte
1432 // after the area just overwritten. It comes in multiple flavors to minimize
1433 // call overhead.
Append1(char * out,const AlphaNum & x)1434 static char *Append1(char *out, const AlphaNum &x) {
1435 if (x.size() > 0) {
1436 memcpy(out, x.data(), x.size());
1437 out += x.size();
1438 }
1439 return out;
1440 }
1441
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1442 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1443 if (x1.size() > 0) {
1444 memcpy(out, x1.data(), x1.size());
1445 out += x1.size();
1446 }
1447 if (x2.size() > 0) {
1448 memcpy(out, x2.data(), x2.size());
1449 out += x2.size();
1450 }
1451 return out;
1452 }
1453
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1454 static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2,
1455 const AlphaNum &x3, const AlphaNum &x4) {
1456 if (x1.size() > 0) {
1457 memcpy(out, x1.data(), x1.size());
1458 out += x1.size();
1459 }
1460 if (x2.size() > 0) {
1461 memcpy(out, x2.data(), x2.size());
1462 out += x2.size();
1463 }
1464 if (x3.size() > 0) {
1465 memcpy(out, x3.data(), x3.size());
1466 out += x3.size();
1467 }
1468 if (x4.size() > 0) {
1469 memcpy(out, x4.data(), x4.size());
1470 out += x4.size();
1471 }
1472 return out;
1473 }
1474
StrCat(const AlphaNum & a,const AlphaNum & b)1475 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1476 string result;
1477 result.resize(a.size() + b.size());
1478 char *const begin = &*result.begin();
1479 char *out = Append2(begin, a, b);
1480 GOOGLE_DCHECK_EQ(out, begin + result.size());
1481 return result;
1482 }
1483
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1484 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1485 string result;
1486 result.resize(a.size() + b.size() + c.size());
1487 char *const begin = &*result.begin();
1488 char *out = Append2(begin, a, b);
1489 out = Append1(out, c);
1490 GOOGLE_DCHECK_EQ(out, begin + result.size());
1491 return result;
1492 }
1493
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1494 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1495 const AlphaNum &d) {
1496 string result;
1497 result.resize(a.size() + b.size() + c.size() + d.size());
1498 char *const begin = &*result.begin();
1499 char *out = Append4(begin, a, b, c, d);
1500 GOOGLE_DCHECK_EQ(out, begin + result.size());
1501 return result;
1502 }
1503
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1504 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1505 const AlphaNum &d, const AlphaNum &e) {
1506 string result;
1507 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1508 char *const begin = &*result.begin();
1509 char *out = Append4(begin, a, b, c, d);
1510 out = Append1(out, e);
1511 GOOGLE_DCHECK_EQ(out, begin + result.size());
1512 return result;
1513 }
1514
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1515 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1516 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1517 string result;
1518 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1519 f.size());
1520 char *const begin = &*result.begin();
1521 char *out = Append4(begin, a, b, c, d);
1522 out = Append2(out, e, f);
1523 GOOGLE_DCHECK_EQ(out, begin + result.size());
1524 return result;
1525 }
1526
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1527 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1528 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1529 const AlphaNum &g) {
1530 string result;
1531 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1532 f.size() + g.size());
1533 char *const begin = &*result.begin();
1534 char *out = Append4(begin, a, b, c, d);
1535 out = Append2(out, e, f);
1536 out = Append1(out, g);
1537 GOOGLE_DCHECK_EQ(out, begin + result.size());
1538 return result;
1539 }
1540
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1541 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1542 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1543 const AlphaNum &g, const AlphaNum &h) {
1544 string result;
1545 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1546 f.size() + g.size() + h.size());
1547 char *const begin = &*result.begin();
1548 char *out = Append4(begin, a, b, c, d);
1549 out = Append4(out, e, f, g, h);
1550 GOOGLE_DCHECK_EQ(out, begin + result.size());
1551 return result;
1552 }
1553
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1554 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1555 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1556 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1557 string result;
1558 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1559 f.size() + g.size() + h.size() + i.size());
1560 char *const begin = &*result.begin();
1561 char *out = Append4(begin, a, b, c, d);
1562 out = Append4(out, e, f, g, h);
1563 out = Append1(out, i);
1564 GOOGLE_DCHECK_EQ(out, begin + result.size());
1565 return result;
1566 }
1567
1568 // It's possible to call StrAppend with a char * pointer that is partway into
1569 // the string we're appending to. However the results of this are random.
1570 // Therefore, check for this in debug mode. Use unsigned math so we only have
1571 // to do one comparison.
1572 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1573 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1574 uintptr_t((dest).size()))
1575
StrAppend(string * result,const AlphaNum & a)1576 void StrAppend(string *result, const AlphaNum &a) {
1577 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1578 result->append(a.data(), a.size());
1579 }
1580
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1581 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1582 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1583 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1584 string::size_type old_size = result->size();
1585 result->resize(old_size + a.size() + b.size());
1586 char *const begin = &*result->begin();
1587 char *out = Append2(begin + old_size, a, b);
1588 GOOGLE_DCHECK_EQ(out, begin + result->size());
1589 }
1590
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1591 void StrAppend(string *result,
1592 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1593 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1594 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1595 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1596 string::size_type old_size = result->size();
1597 result->resize(old_size + a.size() + b.size() + c.size());
1598 char *const begin = &*result->begin();
1599 char *out = Append2(begin + old_size, a, b);
1600 out = Append1(out, c);
1601 GOOGLE_DCHECK_EQ(out, begin + result->size());
1602 }
1603
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1604 void StrAppend(string *result,
1605 const AlphaNum &a, const AlphaNum &b,
1606 const AlphaNum &c, const AlphaNum &d) {
1607 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1608 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1609 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1610 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1611 string::size_type old_size = result->size();
1612 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1613 char *const begin = &*result->begin();
1614 char *out = Append4(begin + old_size, a, b, c, d);
1615 GOOGLE_DCHECK_EQ(out, begin + result->size());
1616 }
1617
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1618 int GlobalReplaceSubstring(const string& substring,
1619 const string& replacement,
1620 string* s) {
1621 GOOGLE_CHECK(s != nullptr);
1622 if (s->empty() || substring.empty())
1623 return 0;
1624 string tmp;
1625 int num_replacements = 0;
1626 int pos = 0;
1627 for (int match_pos = s->find(substring.data(), pos, substring.length());
1628 match_pos != string::npos;
1629 pos = match_pos + substring.length(),
1630 match_pos = s->find(substring.data(), pos, substring.length())) {
1631 ++num_replacements;
1632 // Append the original content before the match.
1633 tmp.append(*s, pos, match_pos - pos);
1634 // Append the replacement for the match.
1635 tmp.append(replacement.begin(), replacement.end());
1636 }
1637 // Append the content after the last match. If no replacements were made, the
1638 // original string is left untouched.
1639 if (num_replacements > 0) {
1640 tmp.append(*s, pos, s->length() - pos);
1641 s->swap(tmp);
1642 }
1643 return num_replacements;
1644 }
1645
CalculateBase64EscapedLen(int input_len,bool do_padding)1646 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1647 // Base64 encodes three bytes of input at a time. If the input is not
1648 // divisible by three, we pad as appropriate.
1649 //
1650 // (from http://tools.ietf.org/html/rfc3548)
1651 // Special processing is performed if fewer than 24 bits are available
1652 // at the end of the data being encoded. A full encoding quantum is
1653 // always completed at the end of a quantity. When fewer than 24 input
1654 // bits are available in an input group, zero bits are added (on the
1655 // right) to form an integral number of 6-bit groups. Padding at the
1656 // end of the data is performed using the '=' character. Since all base
1657 // 64 input is an integral number of octets, only the following cases
1658 // can arise:
1659
1660
1661 // Base64 encodes each three bytes of input into four bytes of output.
1662 int len = (input_len / 3) * 4;
1663
1664 if (input_len % 3 == 0) {
1665 // (from http://tools.ietf.org/html/rfc3548)
1666 // (1) the final quantum of encoding input is an integral multiple of 24
1667 // bits; here, the final unit of encoded output will be an integral
1668 // multiple of 4 characters with no "=" padding,
1669 } else if (input_len % 3 == 1) {
1670 // (from http://tools.ietf.org/html/rfc3548)
1671 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1672 // final unit of encoded output will be two characters followed by two
1673 // "=" padding characters, or
1674 len += 2;
1675 if (do_padding) {
1676 len += 2;
1677 }
1678 } else { // (input_len % 3 == 2)
1679 // (from http://tools.ietf.org/html/rfc3548)
1680 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1681 // final unit of encoded output will be three characters followed by one
1682 // "=" padding character.
1683 len += 3;
1684 if (do_padding) {
1685 len += 1;
1686 }
1687 }
1688
1689 assert(len >= input_len); // make sure we didn't overflow
1690 return len;
1691 }
1692
1693 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1694 int CalculateBase64EscapedLen(int input_len) {
1695 return CalculateBase64EscapedLen(input_len, true);
1696 }
1697
1698 // ----------------------------------------------------------------------
1699 // int Base64Unescape() - base64 decoder
1700 // int Base64Escape() - base64 encoder
1701 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1702 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1703 //
1704 // Check out
1705 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1706 // care about is that...
1707 // Take the encoded stuff in groups of 4 characters and turn each
1708 // character into a code 0 to 63 thus:
1709 // A-Z map to 0 to 25
1710 // a-z map to 26 to 51
1711 // 0-9 map to 52 to 61
1712 // +(- for WebSafe) maps to 62
1713 // /(_ for WebSafe) maps to 63
1714 // There will be four numbers, all less than 64 which can be represented
1715 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1716 // Arrange the 6 digit binary numbers into three bytes as such:
1717 // aaaaaabb bbbbcccc ccdddddd
1718 // Equals signs (one or two) are used at the end of the encoded block to
1719 // indicate that the text was not an integer multiple of three bytes long.
1720 // ----------------------------------------------------------------------
1721
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1722 int Base64UnescapeInternal(const char *src_param, int szsrc,
1723 char *dest, int szdest,
1724 const signed char* unbase64) {
1725 static const char kPad64Equals = '=';
1726 static const char kPad64Dot = '.';
1727
1728 int decode = 0;
1729 int destidx = 0;
1730 int state = 0;
1731 unsigned int ch = 0;
1732 unsigned int temp = 0;
1733
1734 // If "char" is signed by default, using *src as an array index results in
1735 // accessing negative array elements. Treat the input as a pointer to
1736 // unsigned char to avoid this.
1737 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1738
1739 // The GET_INPUT macro gets the next input character, skipping
1740 // over any whitespace, and stopping when we reach the end of the
1741 // string or when we read any non-data character. The arguments are
1742 // an arbitrary identifier (used as a label for goto) and the number
1743 // of data bytes that must remain in the input to avoid aborting the
1744 // loop.
1745 #define GET_INPUT(label, remain) \
1746 label: \
1747 --szsrc; \
1748 ch = *src++; \
1749 decode = unbase64[ch]; \
1750 if (decode < 0) { \
1751 if (ascii_isspace(ch) && szsrc >= remain) \
1752 goto label; \
1753 state = 4 - remain; \
1754 break; \
1755 }
1756
1757 // if dest is null, we're just checking to see if it's legal input
1758 // rather than producing output. (I suspect this could just be done
1759 // with a regexp...). We duplicate the loop so this test can be
1760 // outside it instead of in every iteration.
1761
1762 if (dest) {
1763 // This loop consumes 4 input bytes and produces 3 output bytes
1764 // per iteration. We can't know at the start that there is enough
1765 // data left in the string for a full iteration, so the loop may
1766 // break out in the middle; if so 'state' will be set to the
1767 // number of input bytes read.
1768
1769 while (szsrc >= 4) {
1770 // We'll start by optimistically assuming that the next four
1771 // bytes of the string (src[0..3]) are four good data bytes
1772 // (that is, no nulls, whitespace, padding chars, or illegal
1773 // chars). We need to test src[0..2] for nulls individually
1774 // before constructing temp to preserve the property that we
1775 // never read past a null in the string (no matter how long
1776 // szsrc claims the string is).
1777
1778 if (!src[0] || !src[1] || !src[2] ||
1779 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1780 (unsigned(unbase64[src[1]]) << 12) |
1781 (unsigned(unbase64[src[2]]) << 6) |
1782 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1783 // Iff any of those four characters was bad (null, illegal,
1784 // whitespace, padding), then temp's high bit will be set
1785 // (because unbase64[] is -1 for all bad characters).
1786 //
1787 // We'll back up and resort to the slower decoder, which knows
1788 // how to handle those cases.
1789
1790 GET_INPUT(first, 4);
1791 temp = decode;
1792 GET_INPUT(second, 3);
1793 temp = (temp << 6) | decode;
1794 GET_INPUT(third, 2);
1795 temp = (temp << 6) | decode;
1796 GET_INPUT(fourth, 1);
1797 temp = (temp << 6) | decode;
1798 } else {
1799 // We really did have four good data bytes, so advance four
1800 // characters in the string.
1801
1802 szsrc -= 4;
1803 src += 4;
1804 decode = -1;
1805 ch = '\0';
1806 }
1807
1808 // temp has 24 bits of input, so write that out as three bytes.
1809
1810 if (destidx+3 > szdest) return -1;
1811 dest[destidx+2] = temp;
1812 temp >>= 8;
1813 dest[destidx+1] = temp;
1814 temp >>= 8;
1815 dest[destidx] = temp;
1816 destidx += 3;
1817 }
1818 } else {
1819 while (szsrc >= 4) {
1820 if (!src[0] || !src[1] || !src[2] ||
1821 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1822 (unsigned(unbase64[src[1]]) << 12) |
1823 (unsigned(unbase64[src[2]]) << 6) |
1824 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1825 GET_INPUT(first_no_dest, 4);
1826 GET_INPUT(second_no_dest, 3);
1827 GET_INPUT(third_no_dest, 2);
1828 GET_INPUT(fourth_no_dest, 1);
1829 } else {
1830 szsrc -= 4;
1831 src += 4;
1832 decode = -1;
1833 ch = '\0';
1834 }
1835 destidx += 3;
1836 }
1837 }
1838
1839 #undef GET_INPUT
1840
1841 // if the loop terminated because we read a bad character, return
1842 // now.
1843 if (decode < 0 && ch != '\0' &&
1844 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1845 return -1;
1846
1847 if (ch == kPad64Equals || ch == kPad64Dot) {
1848 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1849 // look at it again when we count to check for the proper number of
1850 // equals signs at the end.
1851 ++szsrc;
1852 --src;
1853 } else {
1854 // This loop consumes 1 input byte per iteration. It's used to
1855 // clean up the 0-3 input bytes remaining when the first, faster
1856 // loop finishes. 'temp' contains the data from 'state' input
1857 // characters read by the first loop.
1858 while (szsrc > 0) {
1859 --szsrc;
1860 ch = *src++;
1861 decode = unbase64[ch];
1862 if (decode < 0) {
1863 if (ascii_isspace(ch)) {
1864 continue;
1865 } else if (ch == '\0') {
1866 break;
1867 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1868 // back up one character; we'll read it again when we check
1869 // for the correct number of pad characters at the end.
1870 ++szsrc;
1871 --src;
1872 break;
1873 } else {
1874 return -1;
1875 }
1876 }
1877
1878 // Each input character gives us six bits of output.
1879 temp = (temp << 6) | decode;
1880 ++state;
1881 if (state == 4) {
1882 // If we've accumulated 24 bits of output, write that out as
1883 // three bytes.
1884 if (dest) {
1885 if (destidx+3 > szdest) return -1;
1886 dest[destidx+2] = temp;
1887 temp >>= 8;
1888 dest[destidx+1] = temp;
1889 temp >>= 8;
1890 dest[destidx] = temp;
1891 }
1892 destidx += 3;
1893 state = 0;
1894 temp = 0;
1895 }
1896 }
1897 }
1898
1899 // Process the leftover data contained in 'temp' at the end of the input.
1900 int expected_equals = 0;
1901 switch (state) {
1902 case 0:
1903 // Nothing left over; output is a multiple of 3 bytes.
1904 break;
1905
1906 case 1:
1907 // Bad input; we have 6 bits left over.
1908 return -1;
1909
1910 case 2:
1911 // Produce one more output byte from the 12 input bits we have left.
1912 if (dest) {
1913 if (destidx+1 > szdest) return -1;
1914 temp >>= 4;
1915 dest[destidx] = temp;
1916 }
1917 ++destidx;
1918 expected_equals = 2;
1919 break;
1920
1921 case 3:
1922 // Produce two more output bytes from the 18 input bits we have left.
1923 if (dest) {
1924 if (destidx+2 > szdest) return -1;
1925 temp >>= 2;
1926 dest[destidx+1] = temp;
1927 temp >>= 8;
1928 dest[destidx] = temp;
1929 }
1930 destidx += 2;
1931 expected_equals = 1;
1932 break;
1933
1934 default:
1935 // state should have no other values at this point.
1936 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1937 }
1938
1939 // The remainder of the string should be all whitespace, mixed with
1940 // exactly 0 equals signs, or exactly 'expected_equals' equals
1941 // signs. (Always accepting 0 equals signs is a google extension
1942 // not covered in the RFC, as is accepting dot as the pad character.)
1943
1944 int equals = 0;
1945 while (szsrc > 0 && *src) {
1946 if (*src == kPad64Equals || *src == kPad64Dot)
1947 ++equals;
1948 else if (!ascii_isspace(*src))
1949 return -1;
1950 --szsrc;
1951 ++src;
1952 }
1953
1954 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1955 }
1956
1957 // The arrays below were generated by the following code
1958 // #include <sys/time.h>
1959 // #include <stdlib.h>
1960 // #include <string.h>
1961 // #include <stdio.h>
1962 // main()
1963 // {
1964 // static const char Base64[] =
1965 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1966 // const char *pos;
1967 // int idx, i, j;
1968 // printf(" ");
1969 // for (i = 0; i < 255; i += 8) {
1970 // for (j = i; j < i + 8; j++) {
1971 // pos = strchr(Base64, j);
1972 // if ((pos == nullptr) || (j == 0))
1973 // idx = -1;
1974 // else
1975 // idx = pos - Base64;
1976 // if (idx == -1)
1977 // printf(" %2d, ", idx);
1978 // else
1979 // printf(" %2d/""*%c*""/,", idx, j);
1980 // }
1981 // printf("\n ");
1982 // }
1983 // }
1984 //
1985 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1986 // tables from the functions below.
1987 static const signed char kUnBase64[] = {
1988 -1, -1, -1, -1, -1, -1, -1, -1,
1989 -1, -1, -1, -1, -1, -1, -1, -1,
1990 -1, -1, -1, -1, -1, -1, -1, -1,
1991 -1, -1, -1, -1, -1, -1, -1, -1,
1992 -1, -1, -1, -1, -1, -1, -1, -1,
1993 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1994 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1995 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
1996 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
1997 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1998 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1999 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
2000 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2001 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2002 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2003 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2004 -1, -1, -1, -1, -1, -1, -1, -1,
2005 -1, -1, -1, -1, -1, -1, -1, -1,
2006 -1, -1, -1, -1, -1, -1, -1, -1,
2007 -1, -1, -1, -1, -1, -1, -1, -1,
2008 -1, -1, -1, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1,
2011 -1, -1, -1, -1, -1, -1, -1, -1,
2012 -1, -1, -1, -1, -1, -1, -1, -1,
2013 -1, -1, -1, -1, -1, -1, -1, -1,
2014 -1, -1, -1, -1, -1, -1, -1, -1,
2015 -1, -1, -1, -1, -1, -1, -1, -1,
2016 -1, -1, -1, -1, -1, -1, -1, -1,
2017 -1, -1, -1, -1, -1, -1, -1, -1,
2018 -1, -1, -1, -1, -1, -1, -1, -1,
2019 -1, -1, -1, -1, -1, -1, -1, -1
2020 };
2021 static const signed char kUnWebSafeBase64[] = {
2022 -1, -1, -1, -1, -1, -1, -1, -1,
2023 -1, -1, -1, -1, -1, -1, -1, -1,
2024 -1, -1, -1, -1, -1, -1, -1, -1,
2025 -1, -1, -1, -1, -1, -1, -1, -1,
2026 -1, -1, -1, -1, -1, -1, -1, -1,
2027 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2028 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2029 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2030 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2031 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2032 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2033 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2034 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2035 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2036 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2037 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2038 -1, -1, -1, -1, -1, -1, -1, -1,
2039 -1, -1, -1, -1, -1, -1, -1, -1,
2040 -1, -1, -1, -1, -1, -1, -1, -1,
2041 -1, -1, -1, -1, -1, -1, -1, -1,
2042 -1, -1, -1, -1, -1, -1, -1, -1,
2043 -1, -1, -1, -1, -1, -1, -1, -1,
2044 -1, -1, -1, -1, -1, -1, -1, -1,
2045 -1, -1, -1, -1, -1, -1, -1, -1,
2046 -1, -1, -1, -1, -1, -1, -1, -1,
2047 -1, -1, -1, -1, -1, -1, -1, -1,
2048 -1, -1, -1, -1, -1, -1, -1, -1,
2049 -1, -1, -1, -1, -1, -1, -1, -1,
2050 -1, -1, -1, -1, -1, -1, -1, -1,
2051 -1, -1, -1, -1, -1, -1, -1, -1,
2052 -1, -1, -1, -1, -1, -1, -1, -1,
2053 -1, -1, -1, -1, -1, -1, -1, -1
2054 };
2055
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2056 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2057 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2058 }
2059
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2060 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2061 const signed char* unbase64) {
2062 // Determine the size of the output string. Base64 encodes every 3 bytes into
2063 // 4 characters. any leftover chars are added directly for good measure.
2064 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2065 const int dest_len = 3 * (slen / 4) + (slen % 4);
2066
2067 dest->resize(dest_len);
2068
2069 // We are getting the destination buffer by getting the beginning of the
2070 // string and converting it into a char *.
2071 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2072 dest_len, unbase64);
2073 if (len < 0) {
2074 dest->clear();
2075 return false;
2076 }
2077
2078 // could be shorter if there was padding
2079 GOOGLE_DCHECK_LE(len, dest_len);
2080 dest->erase(len);
2081
2082 return true;
2083 }
2084
Base64Unescape(StringPiece src,string * dest)2085 bool Base64Unescape(StringPiece src, string* dest) {
2086 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2087 }
2088
WebSafeBase64Unescape(StringPiece src,string * dest)2089 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2090 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2091 }
2092
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2093 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2094 char *dest, int szdest, const char *base64,
2095 bool do_padding) {
2096 static const char kPad64 = '=';
2097
2098 if (szsrc <= 0) return 0;
2099
2100 if (szsrc * 4 > szdest * 3) return 0;
2101
2102 char *cur_dest = dest;
2103 const unsigned char *cur_src = src;
2104
2105 char *limit_dest = dest + szdest;
2106 const unsigned char *limit_src = src + szsrc;
2107
2108 // Three bytes of data encodes to four characters of cyphertext.
2109 // So we can pump through three-byte chunks atomically.
2110 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2111 uint32 in = BigEndian::Load32(cur_src) >> 8;
2112
2113 cur_dest[0] = base64[in >> 18];
2114 in &= 0x3FFFF;
2115 cur_dest[1] = base64[in >> 12];
2116 in &= 0xFFF;
2117 cur_dest[2] = base64[in >> 6];
2118 in &= 0x3F;
2119 cur_dest[3] = base64[in];
2120
2121 cur_dest += 4;
2122 cur_src += 3;
2123 }
2124 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2125 szdest = limit_dest - cur_dest;
2126 szsrc = limit_src - cur_src;
2127
2128 /* now deal with the tail (<=3 bytes) */
2129 switch (szsrc) {
2130 case 0:
2131 // Nothing left; nothing more to do.
2132 break;
2133 case 1: {
2134 // One byte left: this encodes to two characters, and (optionally)
2135 // two pad characters to round out the four-character cypherblock.
2136 if ((szdest -= 2) < 0) return 0;
2137 uint32 in = cur_src[0];
2138 cur_dest[0] = base64[in >> 2];
2139 in &= 0x3;
2140 cur_dest[1] = base64[in << 4];
2141 cur_dest += 2;
2142 if (do_padding) {
2143 if ((szdest -= 2) < 0) return 0;
2144 cur_dest[0] = kPad64;
2145 cur_dest[1] = kPad64;
2146 cur_dest += 2;
2147 }
2148 break;
2149 }
2150 case 2: {
2151 // Two bytes left: this encodes to three characters, and (optionally)
2152 // one pad character to round out the four-character cypherblock.
2153 if ((szdest -= 3) < 0) return 0;
2154 uint32 in = BigEndian::Load16(cur_src);
2155 cur_dest[0] = base64[in >> 10];
2156 in &= 0x3FF;
2157 cur_dest[1] = base64[in >> 4];
2158 in &= 0x00F;
2159 cur_dest[2] = base64[in << 2];
2160 cur_dest += 3;
2161 if (do_padding) {
2162 if ((szdest -= 1) < 0) return 0;
2163 cur_dest[0] = kPad64;
2164 cur_dest += 1;
2165 }
2166 break;
2167 }
2168 case 3: {
2169 // Three bytes left: same as in the big loop above. We can't do this in
2170 // the loop because the loop above always reads 4 bytes, and the fourth
2171 // byte is past the end of the input.
2172 if ((szdest -= 4) < 0) return 0;
2173 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2174 cur_dest[0] = base64[in >> 18];
2175 in &= 0x3FFFF;
2176 cur_dest[1] = base64[in >> 12];
2177 in &= 0xFFF;
2178 cur_dest[2] = base64[in >> 6];
2179 in &= 0x3F;
2180 cur_dest[3] = base64[in];
2181 cur_dest += 4;
2182 break;
2183 }
2184 default:
2185 // Should not be reached: blocks of 4 bytes are handled
2186 // in the while loop before this switch statement.
2187 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2188 break;
2189 }
2190 return (cur_dest - dest);
2191 }
2192
2193 static const char kBase64Chars[] =
2194 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2195
2196 static const char kWebSafeBase64Chars[] =
2197 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2198
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2199 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2200 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2201 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2202 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2203 int szdest, bool do_padding) {
2204 return Base64EscapeInternal(src, szsrc, dest, szdest,
2205 kWebSafeBase64Chars, do_padding);
2206 }
2207
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2208 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2209 string* dest, bool do_padding,
2210 const char* base64_chars) {
2211 const int calc_escaped_size =
2212 CalculateBase64EscapedLen(szsrc, do_padding);
2213 dest->resize(calc_escaped_size);
2214 const int escaped_len = Base64EscapeInternal(src, szsrc,
2215 string_as_array(dest),
2216 dest->size(),
2217 base64_chars,
2218 do_padding);
2219 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2220 dest->erase(escaped_len);
2221 }
2222
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2223 void Base64Escape(const unsigned char *src, int szsrc,
2224 string* dest, bool do_padding) {
2225 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2226 }
2227
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2229 string *dest, bool do_padding) {
2230 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2231 }
2232
Base64Escape(StringPiece src,string * dest)2233 void Base64Escape(StringPiece src, string* dest) {
2234 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2235 src.size(), dest, true);
2236 }
2237
WebSafeBase64Escape(StringPiece src,string * dest)2238 void WebSafeBase64Escape(StringPiece src, string* dest) {
2239 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240 src.size(), dest, false);
2241 }
2242
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2243 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2244 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245 src.size(), dest, true);
2246 }
2247
2248 // Helper to append a Unicode code point to a string as UTF8, without bringing
2249 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2250 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2251 uint32 tmp = 0;
2252 int len = 0;
2253 if (code_point <= 0x7f) {
2254 tmp = code_point;
2255 len = 1;
2256 } else if (code_point <= 0x07ff) {
2257 tmp = 0x0000c080 |
2258 ((code_point & 0x07c0) << 2) |
2259 (code_point & 0x003f);
2260 len = 2;
2261 } else if (code_point <= 0xffff) {
2262 tmp = 0x00e08080 |
2263 ((code_point & 0xf000) << 4) |
2264 ((code_point & 0x0fc0) << 2) |
2265 (code_point & 0x003f);
2266 len = 3;
2267 } else {
2268 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2269 // normally only defined up to there as well.
2270 tmp = 0xf0808080 |
2271 ((code_point & 0x1c0000) << 6) |
2272 ((code_point & 0x03f000) << 4) |
2273 ((code_point & 0x000fc0) << 2) |
2274 (code_point & 0x003f);
2275 len = 4;
2276 }
2277 tmp = ghtonl(tmp);
2278 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2279 return len;
2280 }
2281
2282 // Table of UTF-8 character lengths, based on first byte
2283 static const unsigned char kUTF8LenTbl[256] = {
2284 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2285 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2286 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2287 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2288 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2289 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2290
2291 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2292 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2293 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2294 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2295 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2296 3, 3, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2297
2298 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2299 int UTF8FirstLetterNumBytes(const char* src, int len) {
2300 if (len == 0) {
2301 return 0;
2302 }
2303 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2304 }
2305
2306 // ----------------------------------------------------------------------
2307 // CleanStringLineEndings()
2308 // Clean up a multi-line string to conform to Unix line endings.
2309 // Reads from src and appends to dst, so usually dst should be empty.
2310 //
2311 // If there is no line ending at the end of a non-empty string, it can
2312 // be added automatically.
2313 //
2314 // Four different types of input are correctly handled:
2315 //
2316 // - Unix/Linux files: line ending is LF: pass through unchanged
2317 //
2318 // - DOS/Windows files: line ending is CRLF: convert to LF
2319 //
2320 // - Legacy Mac files: line ending is CR: convert to LF
2321 //
2322 // - Garbled files: random line endings: convert gracefully
2323 // lonely CR, lonely LF, CRLF: convert to LF
2324 //
2325 // @param src The multi-line string to convert
2326 // @param dst The converted string is appended to this string
2327 // @param auto_end_last_line Automatically terminate the last line
2328 //
2329 // Limitations:
2330 //
2331 // This does not do the right thing for CRCRLF files created by
2332 // broken programs that do another Unix->DOS conversion on files
2333 // that are already in CRLF format. For this, a two-pass approach
2334 // brute-force would be needed that
2335 //
2336 // (1) determines the presence of LF (first one is ok)
2337 // (2) if yes, removes any CR, else convert every CR to LF
2338
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2339 void CleanStringLineEndings(const string &src, string *dst,
2340 bool auto_end_last_line) {
2341 if (dst->empty()) {
2342 dst->append(src);
2343 CleanStringLineEndings(dst, auto_end_last_line);
2344 } else {
2345 string tmp = src;
2346 CleanStringLineEndings(&tmp, auto_end_last_line);
2347 dst->append(tmp);
2348 }
2349 }
2350
CleanStringLineEndings(string * str,bool auto_end_last_line)2351 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2352 ptrdiff_t output_pos = 0;
2353 bool r_seen = false;
2354 ptrdiff_t len = str->size();
2355
2356 char *p = &(*str)[0];
2357
2358 for (ptrdiff_t input_pos = 0; input_pos < len;) {
2359 if (!r_seen && input_pos + 8 < len) {
2360 uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2361 // Loop over groups of 8 bytes at a time until we come across
2362 // a word that has a byte whose value is less than or equal to
2363 // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2364 //
2365 // We use a has_less macro that quickly tests a whole 64-bit
2366 // word to see if any of the bytes has a value < N.
2367 //
2368 // For more details, see:
2369 // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2370 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2371 if (!has_less(v, '\r' + 1)) {
2372 #undef has_less
2373 // No byte in this word has a value that could be a \r or a \n
2374 if (output_pos != input_pos) {
2375 GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2376 }
2377 input_pos += 8;
2378 output_pos += 8;
2379 continue;
2380 }
2381 }
2382 string::const_reference in = p[input_pos];
2383 if (in == '\r') {
2384 if (r_seen) p[output_pos++] = '\n';
2385 r_seen = true;
2386 } else if (in == '\n') {
2387 if (input_pos != output_pos)
2388 p[output_pos++] = '\n';
2389 else
2390 output_pos++;
2391 r_seen = false;
2392 } else {
2393 if (r_seen) p[output_pos++] = '\n';
2394 r_seen = false;
2395 if (input_pos != output_pos)
2396 p[output_pos++] = in;
2397 else
2398 output_pos++;
2399 }
2400 input_pos++;
2401 }
2402 if (r_seen ||
2403 (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2404 str->resize(output_pos + 1);
2405 str->operator[](output_pos) = '\n';
2406 } else if (output_pos < len) {
2407 str->resize(output_pos);
2408 }
2409 }
2410
2411 namespace internal {
2412
2413 // ----------------------------------------------------------------------
2414 // NoLocaleStrtod()
2415 // This code will make you cry.
2416 // ----------------------------------------------------------------------
2417
2418 namespace {
2419
2420 // Returns a string identical to *input except that the character pointed to
2421 // by radix_pos (which should be '.') is replaced with the locale-specific
2422 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2423 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2424 // Determine the locale-specific radix character by calling sprintf() to
2425 // print the number 1.5, then stripping off the digits. As far as I can
2426 // tell, this is the only portable, thread-safe way to get the C library
2427 // to divuldge the locale's radix character. No, localeconv() is NOT
2428 // thread-safe.
2429 char temp[16];
2430 int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2431 GOOGLE_CHECK_EQ(temp[0], '1');
2432 GOOGLE_CHECK_EQ(temp[size - 1], '5');
2433 GOOGLE_CHECK_LE(size, 6);
2434
2435 // Now replace the '.' in the input with it.
2436 std::string result;
2437 result.reserve(strlen(input) + size - 3);
2438 result.append(input, radix_pos);
2439 result.append(temp + 1, size - 2);
2440 result.append(radix_pos + 1);
2441 return result;
2442 }
2443
2444 } // namespace
2445
NoLocaleStrtod(const char * str,char ** endptr)2446 double NoLocaleStrtod(const char *str, char **endptr) {
2447 // We cannot simply set the locale to "C" temporarily with setlocale()
2448 // as this is not thread-safe. Instead, we try to parse in the current
2449 // locale first. If parsing stops at a '.' character, then this is a
2450 // pretty good hint that we're actually in some other locale in which
2451 // '.' is not the radix character.
2452
2453 char *temp_endptr;
2454 double result = strtod(str, &temp_endptr);
2455 if (endptr != NULL) *endptr = temp_endptr;
2456 if (*temp_endptr != '.') return result;
2457
2458 // Parsing halted on a '.'. Perhaps we're in a different locale? Let's
2459 // try to replace the '.' with a locale-specific radix character and
2460 // try again.
2461 std::string localized = LocalizeRadix(str, temp_endptr);
2462 const char *localized_cstr = localized.c_str();
2463 char *localized_endptr;
2464 result = strtod(localized_cstr, &localized_endptr);
2465 if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2466 // This attempt got further, so replacing the decimal must have helped.
2467 // Update endptr to point at the right location.
2468 if (endptr != NULL) {
2469 // size_diff is non-zero if the localized radix has multiple bytes.
2470 int size_diff = localized.size() - strlen(str);
2471 // const_cast is necessary to match the strtod() interface.
2472 *endptr = const_cast<char *>(
2473 str + (localized_endptr - localized_cstr - size_diff));
2474 }
2475 }
2476
2477 return result;
2478 }
2479
2480 } // namespace internal
2481
2482 } // namespace protobuf
2483 } // namespace google
2484