1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34
35 #include <errno.h>
36 #include <float.h> // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions. The former is buggy. When invoked like so:
51 // char buffer[32];
52 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10". This is plainly wrong: %g should never print
54 // trailing zeros after the decimal point. For some reason this bug only
55 // occurs with some input values, not all. In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59
60 namespace google {
61 namespace protobuf {
62
63 // These are defined as macros on some platforms. #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67
68 // The definitions of these in ctype.h change based on locale. Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale. So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73 return ('0' <= c && c <= '9') ||
74 ('a' <= c && c <= 'f') ||
75 ('A' <= c && c <= 'F');
76 }
77
isprint(char c)78 inline bool isprint(char c) {
79 return c >= 0x20 && c <= 0x7E;
80 }
81
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 // Replaces any occurrence of the character 'remove' (or the characters
85 // in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(std::string * s,const char * remove,char replacewith)87 void ReplaceCharacters(std::string *s, const char *remove, char replacewith) {
88 const char *str_start = s->c_str();
89 const char *str = str_start;
90 for (str = strpbrk(str, remove);
91 str != nullptr;
92 str = strpbrk(str + 1, remove)) {
93 (*s)[str - str_start] = replacewith;
94 }
95 }
96
StripWhitespace(std::string * str)97 void StripWhitespace(std::string *str) {
98 int str_length = str->length();
99
100 // Strip off leading whitespace.
101 int first = 0;
102 while (first < str_length && ascii_isspace(str->at(first))) {
103 ++first;
104 }
105 // If entire string is white space.
106 if (first == str_length) {
107 str->clear();
108 return;
109 }
110 if (first > 0) {
111 str->erase(0, first);
112 str_length -= first;
113 }
114
115 // Strip off trailing whitespace.
116 int last = str_length - 1;
117 while (last >= 0 && ascii_isspace(str->at(last))) {
118 --last;
119 }
120 if (last != (str_length - 1) && last >= 0) {
121 str->erase(last + 1, std::string::npos);
122 }
123 }
124
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 // Replace the "old" pattern with the "new" pattern in a string,
128 // and append the result to "res". If replace_all is false,
129 // it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131
StringReplace(const std::string & s,const std::string & oldsub,const std::string & newsub,bool replace_all,std::string * res)132 void StringReplace(const std::string &s, const std::string &oldsub,
133 const std::string &newsub, bool replace_all,
134 std::string *res) {
135 if (oldsub.empty()) {
136 res->append(s); // if empty, append the given string.
137 return;
138 }
139
140 std::string::size_type start_pos = 0;
141 std::string::size_type pos;
142 do {
143 pos = s.find(oldsub, start_pos);
144 if (pos == std::string::npos) {
145 break;
146 }
147 res->append(s, start_pos, pos - start_pos);
148 res->append(newsub);
149 start_pos = pos + oldsub.size(); // start searching again after the "old"
150 } while (replace_all);
151 res->append(s, start_pos, s.length() - start_pos);
152 }
153
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 // Give me a string and two patterns "old" and "new", and I replace
157 // the first instance of "old" in the string with "new", if it
158 // exists. If "global" is true; call this repeatedly until it
159 // fails. RETURN a new string, regardless of whether the replacement
160 // happened or not.
161 // ----------------------------------------------------------------------
162
StringReplace(const std::string & s,const std::string & oldsub,const std::string & newsub,bool replace_all)163 std::string StringReplace(const std::string &s, const std::string &oldsub,
164 const std::string &newsub, bool replace_all) {
165 std::string ret;
166 StringReplace(s, oldsub, newsub, replace_all, &ret);
167 return ret;
168 }
169
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 // Split a string using a character delimiter. Append the components
173 // to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
SplitStringToIteratorUsing(StringPiece full,const char * delim,ITR & result)179 static inline void SplitStringToIteratorUsing(StringPiece full,
180 const char *delim, ITR &result) {
181 // Optimize the common case where delim is a single character.
182 if (delim[0] != '\0' && delim[1] == '\0') {
183 char c = delim[0];
184 const char* p = full.data();
185 const char* end = p + full.size();
186 while (p != end) {
187 if (*p == c) {
188 ++p;
189 } else {
190 const char* start = p;
191 while (++p != end && *p != c);
192 *result++ = std::string(start, p - start);
193 }
194 }
195 return;
196 }
197
198 std::string::size_type begin_index, end_index;
199 begin_index = full.find_first_not_of(delim);
200 while (begin_index != std::string::npos) {
201 end_index = full.find_first_of(delim, begin_index);
202 if (end_index == std::string::npos) {
203 *result++ = std::string(full.substr(begin_index));
204 return;
205 }
206 *result++ =
207 std::string(full.substr(begin_index, (end_index - begin_index)));
208 begin_index = full.find_first_not_of(delim, end_index);
209 }
210 }
211
SplitStringUsing(StringPiece full,const char * delim,std::vector<std::string> * result)212 void SplitStringUsing(StringPiece full, const char *delim,
213 std::vector<std::string> *result) {
214 std::back_insert_iterator<std::vector<std::string> > it(*result);
215 SplitStringToIteratorUsing(full, delim, it);
216 }
217
218 // Split a string using a character delimiter. Append the components
219 // to 'result'. If there are consecutive delimiters, this function
220 // will return corresponding empty strings. The string is split into
221 // at most the specified number of pieces greedily. This means that the
222 // last piece may possibly be split further. To split into as many pieces
223 // as possible, specify 0 as the number of pieces.
224 //
225 // If "full" is the empty string, yields an empty string as the only value.
226 //
227 // If "pieces" is negative for some reason, it returns the whole string
228 // ----------------------------------------------------------------------
229 template <typename ITR>
SplitStringToIteratorAllowEmpty(StringPiece full,const char * delim,int pieces,ITR & result)230 static inline void SplitStringToIteratorAllowEmpty(StringPiece full,
231 const char *delim,
232 int pieces, ITR &result) {
233 std::string::size_type begin_index, end_index;
234 begin_index = 0;
235
236 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
237 end_index = full.find_first_of(delim, begin_index);
238 if (end_index == std::string::npos) {
239 *result++ = std::string(full.substr(begin_index));
240 return;
241 }
242 *result++ =
243 std::string(full.substr(begin_index, (end_index - begin_index)));
244 begin_index = end_index + 1;
245 }
246 *result++ = std::string(full.substr(begin_index));
247 }
248
SplitStringAllowEmpty(StringPiece full,const char * delim,std::vector<std::string> * result)249 void SplitStringAllowEmpty(StringPiece full, const char *delim,
250 std::vector<std::string> *result) {
251 std::back_insert_iterator<std::vector<std::string> > it(*result);
252 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
253 }
254
255 // ----------------------------------------------------------------------
256 // JoinStrings()
257 // This merges a vector of string components with delim inserted
258 // as separaters between components.
259 //
260 // ----------------------------------------------------------------------
261 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,std::string * result)262 static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end,
263 const char *delim, std::string *result) {
264 GOOGLE_CHECK(result != nullptr);
265 result->clear();
266 int delim_length = strlen(delim);
267
268 // Precompute resulting length so we can reserve() memory in one shot.
269 int length = 0;
270 for (ITERATOR iter = start; iter != end; ++iter) {
271 if (iter != start) {
272 length += delim_length;
273 }
274 length += iter->size();
275 }
276 result->reserve(length);
277
278 // Now combine everything.
279 for (ITERATOR iter = start; iter != end; ++iter) {
280 if (iter != start) {
281 result->append(delim, delim_length);
282 }
283 result->append(iter->data(), iter->size());
284 }
285 }
286
JoinStrings(const std::vector<std::string> & components,const char * delim,std::string * result)287 void JoinStrings(const std::vector<std::string> &components, const char *delim,
288 std::string *result) {
289 JoinStringsIterator(components.begin(), components.end(), delim, result);
290 }
291
292 // ----------------------------------------------------------------------
293 // UnescapeCEscapeSequences()
294 // This does all the unescaping that C does: \ooo, \r, \n, etc
295 // Returns length of resulting string.
296 // The implementation of \x parses any positive number of hex digits,
297 // but it is an error if the value requires more than 8 bits, and the
298 // result is truncated to 8 bits.
299 //
300 // The second call stores its errors in a supplied string vector.
301 // If the string vector pointer is nullptr, it reports the errors with LOG().
302 // ----------------------------------------------------------------------
303
304 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
305
306 // Protocol buffers doesn't ever care about errors, but I don't want to remove
307 // the code.
308 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
309
UnescapeCEscapeSequences(const char * source,char * dest)310 int UnescapeCEscapeSequences(const char* source, char* dest) {
311 return UnescapeCEscapeSequences(source, dest, nullptr);
312 }
313
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<std::string> * errors)314 int UnescapeCEscapeSequences(const char *source, char *dest,
315 std::vector<std::string> *errors) {
316 GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
317
318 char* d = dest;
319 const char* p = source;
320
321 // Small optimization for case where source = dest and there's no escaping
322 while ( p == d && *p != '\0' && *p != '\\' )
323 p++, d++;
324
325 while (*p != '\0') {
326 if (*p != '\\') {
327 *d++ = *p++;
328 } else {
329 switch ( *++p ) { // skip past the '\\'
330 case '\0':
331 LOG_STRING(ERROR, errors) << "String cannot end with \\";
332 *d = '\0';
333 return d - dest; // we're done with p
334 case 'a': *d++ = '\a'; break;
335 case 'b': *d++ = '\b'; break;
336 case 'f': *d++ = '\f'; break;
337 case 'n': *d++ = '\n'; break;
338 case 'r': *d++ = '\r'; break;
339 case 't': *d++ = '\t'; break;
340 case 'v': *d++ = '\v'; break;
341 case '\\': *d++ = '\\'; break;
342 case '?': *d++ = '\?'; break; // \? Who knew?
343 case '\'': *d++ = '\''; break;
344 case '"': *d++ = '\"'; break;
345 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
346 case '4': case '5': case '6': case '7': {
347 char ch = *p - '0';
348 if ( IS_OCTAL_DIGIT(p[1]) )
349 ch = ch * 8 + *++p - '0';
350 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
351 ch = ch * 8 + *++p - '0'; // now points at last digit
352 *d++ = ch;
353 break;
354 }
355 case 'x': case 'X': {
356 if (!isxdigit(p[1])) {
357 if (p[1] == '\0') {
358 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
359 } else {
360 LOG_STRING(ERROR, errors) <<
361 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
362 }
363 break;
364 }
365 unsigned int ch = 0;
366 const char *hex_start = p;
367 while (isxdigit(p[1])) // arbitrarily many hex digits
368 ch = (ch << 4) + hex_digit_to_int(*++p);
369 if (ch > 0xFF)
370 LOG_STRING(ERROR, errors)
371 << "Value of "
372 << "\\" << std::string(hex_start, p + 1 - hex_start)
373 << " exceeds 8 bits";
374 *d++ = ch;
375 break;
376 }
377 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
378 case 'u': {
379 // \uhhhh => convert 4 hex digits to UTF-8
380 char32 rune = 0;
381 const char *hex_start = p;
382 for (int i = 0; i < 4; ++i) {
383 if (isxdigit(p[1])) { // Look one char ahead.
384 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
385 } else {
386 LOG_STRING(ERROR, errors)
387 << "\\u must be followed by 4 hex digits: \\"
388 << std::string(hex_start, p+1-hex_start);
389 break;
390 }
391 }
392 d += runetochar(d, &rune);
393 break;
394 }
395 case 'U': {
396 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
397 char32 rune = 0;
398 const char *hex_start = p;
399 for (int i = 0; i < 8; ++i) {
400 if (isxdigit(p[1])) { // Look one char ahead.
401 // Don't change rune until we're sure this
402 // is within the Unicode limit, but do advance p.
403 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
404 if (newrune > 0x10FFFF) {
405 LOG_STRING(ERROR, errors)
406 << "Value of \\"
407 << std::string(hex_start, p + 1 - hex_start)
408 << " exceeds Unicode limit (0x10FFFF)";
409 break;
410 } else {
411 rune = newrune;
412 }
413 } else {
414 LOG_STRING(ERROR, errors)
415 << "\\U must be followed by 8 hex digits: \\"
416 << std::string(hex_start, p+1-hex_start);
417 break;
418 }
419 }
420 d += runetochar(d, &rune);
421 break;
422 }
423 #endif
424 default:
425 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
426 }
427 p++; // read past letter we escaped
428 }
429 }
430 *d = '\0';
431 return d - dest;
432 }
433
434 // ----------------------------------------------------------------------
435 // UnescapeCEscapeString()
436 // This does the same thing as UnescapeCEscapeSequences, but creates
437 // a new string. The caller does not need to worry about allocating
438 // a dest buffer. This should be used for non performance critical
439 // tasks such as printing debug messages. It is safe for src and dest
440 // to be the same.
441 //
442 // The second call stores its errors in a supplied string vector.
443 // If the string vector pointer is nullptr, it reports the errors with LOG().
444 //
445 // In the first and second calls, the length of dest is returned. In the
446 // the third call, the new string is returned.
447 // ----------------------------------------------------------------------
UnescapeCEscapeString(const std::string & src,std::string * dest)448 int UnescapeCEscapeString(const std::string &src, std::string *dest) {
449 return UnescapeCEscapeString(src, dest, nullptr);
450 }
451
UnescapeCEscapeString(const std::string & src,std::string * dest,std::vector<std::string> * errors)452 int UnescapeCEscapeString(const std::string &src, std::string *dest,
453 std::vector<std::string> *errors) {
454 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
455 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
456 GOOGLE_CHECK(dest);
457 dest->assign(unescaped.get(), len);
458 return len;
459 }
460
UnescapeCEscapeString(const std::string & src)461 std::string UnescapeCEscapeString(const std::string &src) {
462 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
463 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
464 return std::string(unescaped.get(), len);
465 }
466
467 // ----------------------------------------------------------------------
468 // CEscapeString()
469 // CHexEscapeString()
470 // Copies 'src' to 'dest', escaping dangerous characters using
471 // C-style escape sequences. This is very useful for preparing query
472 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
473 // hexadecimal rather than octal sequences.
474 // Returns the number of bytes written to 'dest' (not including the \0)
475 // or -1 if there was insufficient space.
476 //
477 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
478 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)479 int CEscapeInternal(const char* src, int src_len, char* dest,
480 int dest_len, bool use_hex, bool utf8_safe) {
481 const char* src_end = src + src_len;
482 int used = 0;
483 bool last_hex_escape = false; // true if last output char was \xNN
484
485 for (; src < src_end; src++) {
486 if (dest_len - used < 2) // Need space for two letter escape
487 return -1;
488
489 bool is_hex_escape = false;
490 switch (*src) {
491 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
492 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
493 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
494 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
495 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
496 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
497 default:
498 // Note that if we emit \xNN and the src character after that is a hex
499 // digit then that digit must be escaped too to prevent it being
500 // interpreted as part of the character code by C.
501 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
502 (!isprint(*src) ||
503 (last_hex_escape && isxdigit(*src)))) {
504 if (dest_len - used < 4) // need space for 4 letter escape
505 return -1;
506 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
507 static_cast<uint8>(*src));
508 is_hex_escape = use_hex;
509 used += 4;
510 } else {
511 dest[used++] = *src; break;
512 }
513 }
514 last_hex_escape = is_hex_escape;
515 }
516
517 if (dest_len - used < 1) // make sure that there is room for \0
518 return -1;
519
520 dest[used] = '\0'; // doesn't count towards return value though
521 return used;
522 }
523
524 // Calculates the length of the C-style escaped version of 'src'.
525 // Assumes that non-printable characters are escaped using octal sequences, and
526 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)527 static inline size_t CEscapedLength(StringPiece src) {
528 static char c_escaped_len[256] = {
529 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
530 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
531 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
532 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
533 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
534 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
535 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
536 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
537 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
538 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
539 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
540 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545 };
546
547 size_t escaped_len = 0;
548 for (int i = 0; i < src.size(); ++i) {
549 unsigned char c = static_cast<unsigned char>(src[i]);
550 escaped_len += c_escaped_len[c];
551 }
552 return escaped_len;
553 }
554
555 // ----------------------------------------------------------------------
556 // Escapes 'src' using C-style escape sequences, and appends the escaped string
557 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
558 // the required space using a lookup table, and also does not do any special
559 // handling for Hex or UTF-8 characters.
560 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,std::string * dest)561 void CEscapeAndAppend(StringPiece src, std::string *dest) {
562 size_t escaped_len = CEscapedLength(src);
563 if (escaped_len == src.size()) {
564 dest->append(src.data(), src.size());
565 return;
566 }
567
568 size_t cur_dest_len = dest->size();
569 dest->resize(cur_dest_len + escaped_len);
570 char* append_ptr = &(*dest)[cur_dest_len];
571
572 for (int i = 0; i < src.size(); ++i) {
573 unsigned char c = static_cast<unsigned char>(src[i]);
574 switch (c) {
575 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
576 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
577 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
578 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
579 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
580 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
581 default:
582 if (!isprint(c)) {
583 *append_ptr++ = '\\';
584 *append_ptr++ = '0' + c / 64;
585 *append_ptr++ = '0' + (c % 64) / 8;
586 *append_ptr++ = '0' + c % 8;
587 } else {
588 *append_ptr++ = c;
589 }
590 break;
591 }
592 }
593 }
594
CEscape(const std::string & src)595 std::string CEscape(const std::string &src) {
596 std::string dest;
597 CEscapeAndAppend(src, &dest);
598 return dest;
599 }
600
601 namespace strings {
602
Utf8SafeCEscape(const std::string & src)603 std::string Utf8SafeCEscape(const std::string &src) {
604 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
605 std::unique_ptr<char[]> dest(new char[dest_length]);
606 const int len = CEscapeInternal(src.data(), src.size(),
607 dest.get(), dest_length, false, true);
608 GOOGLE_DCHECK_GE(len, 0);
609 return std::string(dest.get(), len);
610 }
611
CHexEscape(const std::string & src)612 std::string CHexEscape(const std::string &src) {
613 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
614 std::unique_ptr<char[]> dest(new char[dest_length]);
615 const int len = CEscapeInternal(src.data(), src.size(),
616 dest.get(), dest_length, true, false);
617 GOOGLE_DCHECK_GE(len, 0);
618 return std::string(dest.get(), len);
619 }
620
621 } // namespace strings
622
623 // ----------------------------------------------------------------------
624 // strto32_adaptor()
625 // strtou32_adaptor()
626 // Implementation of strto[u]l replacements that have identical
627 // overflow and underflow characteristics for both ILP-32 and LP-64
628 // platforms, including errno preservation in error-free calls.
629 // ----------------------------------------------------------------------
630
strto32_adaptor(const char * nptr,char ** endptr,int base)631 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
632 const int saved_errno = errno;
633 errno = 0;
634 const long result = strtol(nptr, endptr, base);
635 if (errno == ERANGE && result == LONG_MIN) {
636 return kint32min;
637 } else if (errno == ERANGE && result == LONG_MAX) {
638 return kint32max;
639 } else if (errno == 0 && result < kint32min) {
640 errno = ERANGE;
641 return kint32min;
642 } else if (errno == 0 && result > kint32max) {
643 errno = ERANGE;
644 return kint32max;
645 }
646 if (errno == 0)
647 errno = saved_errno;
648 return static_cast<int32>(result);
649 }
650
strtou32_adaptor(const char * nptr,char ** endptr,int base)651 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
652 const int saved_errno = errno;
653 errno = 0;
654 const unsigned long result = strtoul(nptr, endptr, base);
655 if (errno == ERANGE && result == ULONG_MAX) {
656 return kuint32max;
657 } else if (errno == 0 && result > kuint32max) {
658 errno = ERANGE;
659 return kuint32max;
660 }
661 if (errno == 0)
662 errno = saved_errno;
663 return static_cast<uint32>(result);
664 }
665
safe_parse_sign(std::string * text,bool * negative_ptr)666 inline bool safe_parse_sign(std::string *text /*inout*/,
667 bool *negative_ptr /*output*/) {
668 const char* start = text->data();
669 const char* end = start + text->size();
670
671 // Consume whitespace.
672 while (start < end && (start[0] == ' ')) {
673 ++start;
674 }
675 while (start < end && (end[-1] == ' ')) {
676 --end;
677 }
678 if (start >= end) {
679 return false;
680 }
681
682 // Consume sign.
683 *negative_ptr = (start[0] == '-');
684 if (*negative_ptr || start[0] == '+') {
685 ++start;
686 if (start >= end) {
687 return false;
688 }
689 }
690 *text = text->substr(start - text->data(), end - start);
691 return true;
692 }
693
694 template <typename IntType>
safe_parse_positive_int(std::string text,IntType * value_p)695 bool safe_parse_positive_int(std::string text, IntType *value_p) {
696 int base = 10;
697 IntType value = 0;
698 const IntType vmax = std::numeric_limits<IntType>::max();
699 assert(vmax > 0);
700 assert(vmax >= base);
701 const IntType vmax_over_base = vmax / base;
702 const char* start = text.data();
703 const char* end = start + text.size();
704 // loop over digits
705 for (; start < end; ++start) {
706 unsigned char c = static_cast<unsigned char>(start[0]);
707 int digit = c - '0';
708 if (digit >= base || digit < 0) {
709 *value_p = value;
710 return false;
711 }
712 if (value > vmax_over_base) {
713 *value_p = vmax;
714 return false;
715 }
716 value *= base;
717 if (value > vmax - digit) {
718 *value_p = vmax;
719 return false;
720 }
721 value += digit;
722 }
723 *value_p = value;
724 return true;
725 }
726
727 template <typename IntType>
safe_parse_negative_int(const std::string & text,IntType * value_p)728 bool safe_parse_negative_int(const std::string &text, IntType *value_p) {
729 int base = 10;
730 IntType value = 0;
731 const IntType vmin = std::numeric_limits<IntType>::min();
732 assert(vmin < 0);
733 assert(vmin <= 0 - base);
734 IntType vmin_over_base = vmin / base;
735 // 2003 c++ standard [expr.mul]
736 // "... the sign of the remainder is implementation-defined."
737 // Although (vmin/base)*base + vmin%base is always vmin.
738 // 2011 c++ standard tightens the spec but we cannot rely on it.
739 if (vmin % base > 0) {
740 vmin_over_base += 1;
741 }
742 const char* start = text.data();
743 const char* end = start + text.size();
744 // loop over digits
745 for (; start < end; ++start) {
746 unsigned char c = static_cast<unsigned char>(start[0]);
747 int digit = c - '0';
748 if (digit >= base || digit < 0) {
749 *value_p = value;
750 return false;
751 }
752 if (value < vmin_over_base) {
753 *value_p = vmin;
754 return false;
755 }
756 value *= base;
757 if (value < vmin + digit) {
758 *value_p = vmin;
759 return false;
760 }
761 value -= digit;
762 }
763 *value_p = value;
764 return true;
765 }
766
767 template <typename IntType>
safe_int_internal(std::string text,IntType * value_p)768 bool safe_int_internal(std::string text, IntType *value_p) {
769 *value_p = 0;
770 bool negative;
771 if (!safe_parse_sign(&text, &negative)) {
772 return false;
773 }
774 if (!negative) {
775 return safe_parse_positive_int(text, value_p);
776 } else {
777 return safe_parse_negative_int(text, value_p);
778 }
779 }
780
781 template <typename IntType>
safe_uint_internal(std::string text,IntType * value_p)782 bool safe_uint_internal(std::string text, IntType *value_p) {
783 *value_p = 0;
784 bool negative;
785 if (!safe_parse_sign(&text, &negative) || negative) {
786 return false;
787 }
788 return safe_parse_positive_int(text, value_p);
789 }
790
791 // ----------------------------------------------------------------------
792 // FastIntToBuffer()
793 // FastInt64ToBuffer()
794 // FastHexToBuffer()
795 // FastHex64ToBuffer()
796 // FastHex32ToBuffer()
797 // ----------------------------------------------------------------------
798
799 // Offset into buffer where FastInt64ToBuffer places the end of string
800 // null character. Also used by FastInt64ToBufferLeft.
801 static const int kFastInt64ToBufferOffset = 21;
802
FastInt64ToBuffer(int64 i,char * buffer)803 char *FastInt64ToBuffer(int64 i, char* buffer) {
804 // We could collapse the positive and negative sections, but that
805 // would be slightly slower for positive numbers...
806 // 22 bytes is enough to store -2**64, -18446744073709551616.
807 char* p = buffer + kFastInt64ToBufferOffset;
808 *p-- = '\0';
809 if (i >= 0) {
810 do {
811 *p-- = '0' + i % 10;
812 i /= 10;
813 } while (i > 0);
814 return p + 1;
815 } else {
816 // On different platforms, % and / have different behaviors for
817 // negative numbers, so we need to jump through hoops to make sure
818 // we don't divide negative numbers.
819 if (i > -10) {
820 i = -i;
821 *p-- = '0' + i;
822 *p = '-';
823 return p;
824 } else {
825 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
826 i = i + 10;
827 i = -i;
828 *p-- = '0' + i % 10;
829 // Undo what we did a moment ago
830 i = i / 10 + 1;
831 do {
832 *p-- = '0' + i % 10;
833 i /= 10;
834 } while (i > 0);
835 *p = '-';
836 return p;
837 }
838 }
839 }
840
841 // Offset into buffer where FastInt32ToBuffer places the end of string
842 // null character. Also used by FastInt32ToBufferLeft
843 static const int kFastInt32ToBufferOffset = 11;
844
845 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
846 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
847 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)848 char *FastInt32ToBuffer(int32 i, char* buffer) {
849 // We could collapse the positive and negative sections, but that
850 // would be slightly slower for positive numbers...
851 // 12 bytes is enough to store -2**32, -4294967296.
852 char* p = buffer + kFastInt32ToBufferOffset;
853 *p-- = '\0';
854 if (i >= 0) {
855 do {
856 *p-- = '0' + i % 10;
857 i /= 10;
858 } while (i > 0);
859 return p + 1;
860 } else {
861 // On different platforms, % and / have different behaviors for
862 // negative numbers, so we need to jump through hoops to make sure
863 // we don't divide negative numbers.
864 if (i > -10) {
865 i = -i;
866 *p-- = '0' + i;
867 *p = '-';
868 return p;
869 } else {
870 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
871 i = i + 10;
872 i = -i;
873 *p-- = '0' + i % 10;
874 // Undo what we did a moment ago
875 i = i / 10 + 1;
876 do {
877 *p-- = '0' + i % 10;
878 i /= 10;
879 } while (i > 0);
880 *p = '-';
881 return p;
882 }
883 }
884 }
885
FastHexToBuffer(int i,char * buffer)886 char *FastHexToBuffer(int i, char* buffer) {
887 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
888
889 static const char *hexdigits = "0123456789abcdef";
890 char *p = buffer + 21;
891 *p-- = '\0';
892 do {
893 *p-- = hexdigits[i & 15]; // mod by 16
894 i >>= 4; // divide by 16
895 } while (i > 0);
896 return p + 1;
897 }
898
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)899 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
900 static const char *hexdigits = "0123456789abcdef";
901 buffer[num_byte] = '\0';
902 for (int i = num_byte - 1; i >= 0; i--) {
903 #ifdef _M_X64
904 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
905 // block. Given that the uint32 cast was to improve performance on 32-bit
906 // platforms, we use 64-bit '&' directly.
907 buffer[i] = hexdigits[value & 0xf];
908 #else
909 buffer[i] = hexdigits[uint32(value) & 0xf];
910 #endif
911 value >>= 4;
912 }
913 return buffer;
914 }
915
FastHex64ToBuffer(uint64 value,char * buffer)916 char *FastHex64ToBuffer(uint64 value, char* buffer) {
917 return InternalFastHexToBuffer(value, buffer, 16);
918 }
919
FastHex32ToBuffer(uint32 value,char * buffer)920 char *FastHex32ToBuffer(uint32 value, char* buffer) {
921 return InternalFastHexToBuffer(value, buffer, 8);
922 }
923
924 // ----------------------------------------------------------------------
925 // FastInt32ToBufferLeft()
926 // FastUInt32ToBufferLeft()
927 // FastInt64ToBufferLeft()
928 // FastUInt64ToBufferLeft()
929 //
930 // Like the Fast*ToBuffer() functions above, these are intended for speed.
931 // Unlike the Fast*ToBuffer() functions, however, these functions write
932 // their output to the beginning of the buffer (hence the name, as the
933 // output is left-aligned). The caller is responsible for ensuring that
934 // the buffer has enough space to hold the output.
935 //
936 // Returns a pointer to the end of the string (i.e. the null character
937 // terminating the string).
938 // ----------------------------------------------------------------------
939
940 static const char two_ASCII_digits[100][2] = {
941 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
942 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
943 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
944 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
945 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
946 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
947 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
948 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
949 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
950 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
951 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
952 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
953 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
954 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
955 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
956 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
957 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
958 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
959 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
960 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
961 };
962
FastUInt32ToBufferLeft(uint32 u,char * buffer)963 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
964 uint32 digits;
965 const char *ASCII_digits = nullptr;
966 // The idea of this implementation is to trim the number of divides to as few
967 // as possible by using multiplication and subtraction rather than mod (%),
968 // and by outputting two digits at a time rather than one.
969 // The huge-number case is first, in the hopes that the compiler will output
970 // that case in one branch-free block of code, and only output conditional
971 // branches into it from below.
972 if (u >= 1000000000) { // >= 1,000,000,000
973 digits = u / 100000000; // 100,000,000
974 ASCII_digits = two_ASCII_digits[digits];
975 buffer[0] = ASCII_digits[0];
976 buffer[1] = ASCII_digits[1];
977 buffer += 2;
978 sublt100_000_000:
979 u -= digits * 100000000; // 100,000,000
980 lt100_000_000:
981 digits = u / 1000000; // 1,000,000
982 ASCII_digits = two_ASCII_digits[digits];
983 buffer[0] = ASCII_digits[0];
984 buffer[1] = ASCII_digits[1];
985 buffer += 2;
986 sublt1_000_000:
987 u -= digits * 1000000; // 1,000,000
988 lt1_000_000:
989 digits = u / 10000; // 10,000
990 ASCII_digits = two_ASCII_digits[digits];
991 buffer[0] = ASCII_digits[0];
992 buffer[1] = ASCII_digits[1];
993 buffer += 2;
994 sublt10_000:
995 u -= digits * 10000; // 10,000
996 lt10_000:
997 digits = u / 100;
998 ASCII_digits = two_ASCII_digits[digits];
999 buffer[0] = ASCII_digits[0];
1000 buffer[1] = ASCII_digits[1];
1001 buffer += 2;
1002 sublt100:
1003 u -= digits * 100;
1004 lt100:
1005 digits = u;
1006 ASCII_digits = two_ASCII_digits[digits];
1007 buffer[0] = ASCII_digits[0];
1008 buffer[1] = ASCII_digits[1];
1009 buffer += 2;
1010 done:
1011 *buffer = 0;
1012 return buffer;
1013 }
1014
1015 if (u < 100) {
1016 digits = u;
1017 if (u >= 10) goto lt100;
1018 *buffer++ = '0' + digits;
1019 goto done;
1020 }
1021 if (u < 10000) { // 10,000
1022 if (u >= 1000) goto lt10_000;
1023 digits = u / 100;
1024 *buffer++ = '0' + digits;
1025 goto sublt100;
1026 }
1027 if (u < 1000000) { // 1,000,000
1028 if (u >= 100000) goto lt1_000_000;
1029 digits = u / 10000; // 10,000
1030 *buffer++ = '0' + digits;
1031 goto sublt10_000;
1032 }
1033 if (u < 100000000) { // 100,000,000
1034 if (u >= 10000000) goto lt100_000_000;
1035 digits = u / 1000000; // 1,000,000
1036 *buffer++ = '0' + digits;
1037 goto sublt1_000_000;
1038 }
1039 // we already know that u < 1,000,000,000
1040 digits = u / 100000000; // 100,000,000
1041 *buffer++ = '0' + digits;
1042 goto sublt100_000_000;
1043 }
1044
FastInt32ToBufferLeft(int32 i,char * buffer)1045 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1046 uint32 u = 0;
1047 if (i < 0) {
1048 *buffer++ = '-';
1049 u -= i;
1050 } else {
1051 u = i;
1052 }
1053 return FastUInt32ToBufferLeft(u, buffer);
1054 }
1055
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1056 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1057 int digits;
1058 const char *ASCII_digits = nullptr;
1059
1060 uint32 u = static_cast<uint32>(u64);
1061 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1062
1063 uint64 top_11_digits = u64 / 1000000000;
1064 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1065 u = u64 - (top_11_digits * 1000000000);
1066
1067 digits = u / 10000000; // 10,000,000
1068 GOOGLE_DCHECK_LT(digits, 100);
1069 ASCII_digits = two_ASCII_digits[digits];
1070 buffer[0] = ASCII_digits[0];
1071 buffer[1] = ASCII_digits[1];
1072 buffer += 2;
1073 u -= digits * 10000000; // 10,000,000
1074 digits = u / 100000; // 100,000
1075 ASCII_digits = two_ASCII_digits[digits];
1076 buffer[0] = ASCII_digits[0];
1077 buffer[1] = ASCII_digits[1];
1078 buffer += 2;
1079 u -= digits * 100000; // 100,000
1080 digits = u / 1000; // 1,000
1081 ASCII_digits = two_ASCII_digits[digits];
1082 buffer[0] = ASCII_digits[0];
1083 buffer[1] = ASCII_digits[1];
1084 buffer += 2;
1085 u -= digits * 1000; // 1,000
1086 digits = u / 10;
1087 ASCII_digits = two_ASCII_digits[digits];
1088 buffer[0] = ASCII_digits[0];
1089 buffer[1] = ASCII_digits[1];
1090 buffer += 2;
1091 u -= digits * 10;
1092 digits = u;
1093 *buffer++ = '0' + digits;
1094 *buffer = 0;
1095 return buffer;
1096 }
1097
FastInt64ToBufferLeft(int64 i,char * buffer)1098 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1099 uint64 u = 0;
1100 if (i < 0) {
1101 *buffer++ = '-';
1102 u -= i;
1103 } else {
1104 u = i;
1105 }
1106 return FastUInt64ToBufferLeft(u, buffer);
1107 }
1108
1109 // ----------------------------------------------------------------------
1110 // SimpleItoa()
1111 // Description: converts an integer to a string.
1112 //
1113 // Return value: string
1114 // ----------------------------------------------------------------------
1115
SimpleItoa(int i)1116 std::string SimpleItoa(int i) {
1117 char buffer[kFastToBufferSize];
1118 return (sizeof(i) == 4) ?
1119 FastInt32ToBuffer(i, buffer) :
1120 FastInt64ToBuffer(i, buffer);
1121 }
1122
SimpleItoa(unsigned int i)1123 std::string SimpleItoa(unsigned int i) {
1124 char buffer[kFastToBufferSize];
1125 return std::string(buffer, (sizeof(i) == 4)
1126 ? FastUInt32ToBufferLeft(i, buffer)
1127 : FastUInt64ToBufferLeft(i, buffer));
1128 }
1129
SimpleItoa(long i)1130 std::string SimpleItoa(long i) {
1131 char buffer[kFastToBufferSize];
1132 return (sizeof(i) == 4) ?
1133 FastInt32ToBuffer(i, buffer) :
1134 FastInt64ToBuffer(i, buffer);
1135 }
1136
SimpleItoa(unsigned long i)1137 std::string SimpleItoa(unsigned long i) {
1138 char buffer[kFastToBufferSize];
1139 return std::string(buffer, (sizeof(i) == 4)
1140 ? FastUInt32ToBufferLeft(i, buffer)
1141 : FastUInt64ToBufferLeft(i, buffer));
1142 }
1143
SimpleItoa(long long i)1144 std::string SimpleItoa(long long i) {
1145 char buffer[kFastToBufferSize];
1146 return (sizeof(i) == 4) ?
1147 FastInt32ToBuffer(i, buffer) :
1148 FastInt64ToBuffer(i, buffer);
1149 }
1150
SimpleItoa(unsigned long long i)1151 std::string SimpleItoa(unsigned long long i) {
1152 char buffer[kFastToBufferSize];
1153 return std::string(buffer, (sizeof(i) == 4)
1154 ? FastUInt32ToBufferLeft(i, buffer)
1155 : FastUInt64ToBufferLeft(i, buffer));
1156 }
1157
1158 // ----------------------------------------------------------------------
1159 // SimpleDtoa()
1160 // SimpleFtoa()
1161 // DoubleToBuffer()
1162 // FloatToBuffer()
1163 // We want to print the value without losing precision, but we also do
1164 // not want to print more digits than necessary. This turns out to be
1165 // trickier than it sounds. Numbers like 0.2 cannot be represented
1166 // exactly in binary. If we print 0.2 with a very large precision,
1167 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1168 // On the other hand, if we set the precision too low, we lose
1169 // significant digits when printing numbers that actually need them.
1170 // It turns out there is no precision value that does the right thing
1171 // for all numbers.
1172 //
1173 // Our strategy is to first try printing with a precision that is never
1174 // over-precise, then parse the result with strtod() to see if it
1175 // matches. If not, we print again with a precision that will always
1176 // give a precise result, but may use more digits than necessary.
1177 //
1178 // An arguably better strategy would be to use the algorithm described
1179 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1180 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1181 // however, that the following implementation is about as fast as
1182 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1183 // will not scale well on multi-core machines. DMG's code is slightly
1184 // more accurate (in that it will never use more digits than
1185 // necessary), but this is probably irrelevant for most users.
1186 //
1187 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1188 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1189 // one in that it makes guesses and then uses strtod() to check them.
1190 // Their implementation is faster because they use their own code to
1191 // generate the digits in the first place rather than use snprintf(),
1192 // thus avoiding format string parsing overhead. However, this makes
1193 // it considerably more complicated than the following implementation,
1194 // and it is embedded in a larger library. If speed turns out to be
1195 // an issue, we could re-implement this in terms of their
1196 // implementation.
1197 // ----------------------------------------------------------------------
1198
SimpleDtoa(double value)1199 std::string SimpleDtoa(double value) {
1200 char buffer[kDoubleToBufferSize];
1201 return DoubleToBuffer(value, buffer);
1202 }
1203
SimpleFtoa(float value)1204 std::string SimpleFtoa(float value) {
1205 char buffer[kFloatToBufferSize];
1206 return FloatToBuffer(value, buffer);
1207 }
1208
IsValidFloatChar(char c)1209 static inline bool IsValidFloatChar(char c) {
1210 return ('0' <= c && c <= '9') ||
1211 c == 'e' || c == 'E' ||
1212 c == '+' || c == '-';
1213 }
1214
DelocalizeRadix(char * buffer)1215 void DelocalizeRadix(char* buffer) {
1216 // Fast check: if the buffer has a normal decimal point, assume no
1217 // translation is needed.
1218 if (strchr(buffer, '.') != nullptr) return;
1219
1220 // Find the first unknown character.
1221 while (IsValidFloatChar(*buffer)) ++buffer;
1222
1223 if (*buffer == '\0') {
1224 // No radix character found.
1225 return;
1226 }
1227
1228 // We are now pointing at the locale-specific radix character. Replace it
1229 // with '.'.
1230 *buffer = '.';
1231 ++buffer;
1232
1233 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1234 // It appears the radix was a multi-byte character. We need to remove the
1235 // extra bytes.
1236 char* target = buffer;
1237 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1238 memmove(target, buffer, strlen(buffer) + 1);
1239 }
1240 }
1241
DoubleToBuffer(double value,char * buffer)1242 char* DoubleToBuffer(double value, char* buffer) {
1243 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1244 // platforms these days. Just in case some system exists where DBL_DIG
1245 // is significantly larger -- and risks overflowing our buffer -- we have
1246 // this assert.
1247 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1248
1249 if (value == std::numeric_limits<double>::infinity()) {
1250 strcpy(buffer, "inf");
1251 return buffer;
1252 } else if (value == -std::numeric_limits<double>::infinity()) {
1253 strcpy(buffer, "-inf");
1254 return buffer;
1255 } else if (std::isnan(value)) {
1256 strcpy(buffer, "nan");
1257 return buffer;
1258 }
1259
1260 int snprintf_result =
1261 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1262
1263 // The snprintf should never overflow because the buffer is significantly
1264 // larger than the precision we asked for.
1265 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1266
1267 // We need to make parsed_value volatile in order to force the compiler to
1268 // write it out to the stack. Otherwise, it may keep the value in a
1269 // register, and if it does that, it may keep it as a long double instead
1270 // of a double. This long double may have extra bits that make it compare
1271 // unequal to "value" even though it would be exactly equal if it were
1272 // truncated to a double.
1273 volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1274 if (parsed_value != value) {
1275 int snprintf_result =
1276 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1277
1278 // Should never overflow; see above.
1279 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1280 }
1281
1282 DelocalizeRadix(buffer);
1283 return buffer;
1284 }
1285
memcasecmp(const char * s1,const char * s2,size_t len)1286 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1287 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1288 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1289
1290 for ( int i = 0; i < len; i++ ) {
1291 const int diff =
1292 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1293 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1294 if (diff != 0) return diff;
1295 }
1296 return 0;
1297 }
1298
CaseEqual(StringPiece s1,StringPiece s2)1299 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1300 if (s1.size() != s2.size()) return false;
1301 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1302 }
1303
safe_strtob(StringPiece str,bool * value)1304 bool safe_strtob(StringPiece str, bool* value) {
1305 GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1306 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1307 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1308 CaseEqual(str, "1")) {
1309 *value = true;
1310 return true;
1311 }
1312 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1313 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1314 CaseEqual(str, "0")) {
1315 *value = false;
1316 return true;
1317 }
1318 return false;
1319 }
1320
safe_strtof(const char * str,float * value)1321 bool safe_strtof(const char* str, float* value) {
1322 char* endptr;
1323 errno = 0; // errno only gets set on errors
1324 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1325 *value = internal::NoLocaleStrtod(str, &endptr);
1326 #else
1327 *value = strtof(str, &endptr);
1328 #endif
1329 return *str != 0 && *endptr == 0 && errno == 0;
1330 }
1331
safe_strtod(const char * str,double * value)1332 bool safe_strtod(const char* str, double* value) {
1333 char* endptr;
1334 *value = internal::NoLocaleStrtod(str, &endptr);
1335 if (endptr != str) {
1336 while (ascii_isspace(*endptr)) ++endptr;
1337 }
1338 // Ignore range errors from strtod. The values it
1339 // returns on underflow and overflow are the right
1340 // fallback in a robust setting.
1341 return *str != '\0' && *endptr == '\0';
1342 }
1343
safe_strto32(const std::string & str,int32 * value)1344 bool safe_strto32(const std::string &str, int32 *value) {
1345 return safe_int_internal(str, value);
1346 }
1347
safe_strtou32(const std::string & str,uint32 * value)1348 bool safe_strtou32(const std::string &str, uint32 *value) {
1349 return safe_uint_internal(str, value);
1350 }
1351
safe_strto64(const std::string & str,int64 * value)1352 bool safe_strto64(const std::string &str, int64 *value) {
1353 return safe_int_internal(str, value);
1354 }
1355
safe_strtou64(const std::string & str,uint64 * value)1356 bool safe_strtou64(const std::string &str, uint64 *value) {
1357 return safe_uint_internal(str, value);
1358 }
1359
FloatToBuffer(float value,char * buffer)1360 char* FloatToBuffer(float value, char* buffer) {
1361 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1362 // platforms these days. Just in case some system exists where FLT_DIG
1363 // is significantly larger -- and risks overflowing our buffer -- we have
1364 // this assert.
1365 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1366
1367 if (value == std::numeric_limits<double>::infinity()) {
1368 strcpy(buffer, "inf");
1369 return buffer;
1370 } else if (value == -std::numeric_limits<double>::infinity()) {
1371 strcpy(buffer, "-inf");
1372 return buffer;
1373 } else if (std::isnan(value)) {
1374 strcpy(buffer, "nan");
1375 return buffer;
1376 }
1377
1378 int snprintf_result =
1379 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1380
1381 // The snprintf should never overflow because the buffer is significantly
1382 // larger than the precision we asked for.
1383 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1384
1385 float parsed_value;
1386 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1387 int snprintf_result =
1388 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1389
1390 // Should never overflow; see above.
1391 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1392 }
1393
1394 DelocalizeRadix(buffer);
1395 return buffer;
1396 }
1397
1398 namespace strings {
1399
AlphaNum(strings::Hex hex)1400 AlphaNum::AlphaNum(strings::Hex hex) {
1401 char *const end = &digits[kFastToBufferSize];
1402 char *writer = end;
1403 uint64 value = hex.value;
1404 uint64 width = hex.spec;
1405 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1406 // where 0x10000 is the smallest hex number that is as wide as the user
1407 // asked for.
1408 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1409 static const char hexdigits[] = "0123456789abcdef";
1410 do {
1411 *--writer = hexdigits[value & 0xF];
1412 value >>= 4;
1413 mask >>= 4;
1414 } while (mask != 0);
1415 piece_data_ = writer;
1416 piece_size_ = end - writer;
1417 }
1418
1419 } // namespace strings
1420
1421 // ----------------------------------------------------------------------
1422 // StrCat()
1423 // This merges the given strings or integers, with no delimiter. This
1424 // is designed to be the fastest possible way to construct a string out
1425 // of a mix of raw C strings, C++ strings, and integer values.
1426 // ----------------------------------------------------------------------
1427
1428 // Append is merely a version of memcpy that returns the address of the byte
1429 // after the area just overwritten. It comes in multiple flavors to minimize
1430 // call overhead.
Append1(char * out,const AlphaNum & x)1431 static char *Append1(char *out, const AlphaNum &x) {
1432 if (x.size() > 0) {
1433 memcpy(out, x.data(), x.size());
1434 out += x.size();
1435 }
1436 return out;
1437 }
1438
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1439 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1440 if (x1.size() > 0) {
1441 memcpy(out, x1.data(), x1.size());
1442 out += x1.size();
1443 }
1444 if (x2.size() > 0) {
1445 memcpy(out, x2.data(), x2.size());
1446 out += x2.size();
1447 }
1448 return out;
1449 }
1450
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1451 static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2,
1452 const AlphaNum &x3, const AlphaNum &x4) {
1453 if (x1.size() > 0) {
1454 memcpy(out, x1.data(), x1.size());
1455 out += x1.size();
1456 }
1457 if (x2.size() > 0) {
1458 memcpy(out, x2.data(), x2.size());
1459 out += x2.size();
1460 }
1461 if (x3.size() > 0) {
1462 memcpy(out, x3.data(), x3.size());
1463 out += x3.size();
1464 }
1465 if (x4.size() > 0) {
1466 memcpy(out, x4.data(), x4.size());
1467 out += x4.size();
1468 }
1469 return out;
1470 }
1471
StrCat(const AlphaNum & a,const AlphaNum & b)1472 std::string StrCat(const AlphaNum &a, const AlphaNum &b) {
1473 std::string result;
1474 result.resize(a.size() + b.size());
1475 char *const begin = &*result.begin();
1476 char *out = Append2(begin, a, b);
1477 GOOGLE_DCHECK_EQ(out, begin + result.size());
1478 return result;
1479 }
1480
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1481 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1482 std::string result;
1483 result.resize(a.size() + b.size() + c.size());
1484 char *const begin = &*result.begin();
1485 char *out = Append2(begin, a, b);
1486 out = Append1(out, c);
1487 GOOGLE_DCHECK_EQ(out, begin + result.size());
1488 return result;
1489 }
1490
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1491 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1492 const AlphaNum &d) {
1493 std::string result;
1494 result.resize(a.size() + b.size() + c.size() + d.size());
1495 char *const begin = &*result.begin();
1496 char *out = Append4(begin, a, b, c, d);
1497 GOOGLE_DCHECK_EQ(out, begin + result.size());
1498 return result;
1499 }
1500
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1501 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1502 const AlphaNum &d, const AlphaNum &e) {
1503 std::string result;
1504 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1505 char *const begin = &*result.begin();
1506 char *out = Append4(begin, a, b, c, d);
1507 out = Append1(out, e);
1508 GOOGLE_DCHECK_EQ(out, begin + result.size());
1509 return result;
1510 }
1511
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1512 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1513 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1514 std::string result;
1515 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1516 f.size());
1517 char *const begin = &*result.begin();
1518 char *out = Append4(begin, a, b, c, d);
1519 out = Append2(out, e, f);
1520 GOOGLE_DCHECK_EQ(out, begin + result.size());
1521 return result;
1522 }
1523
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1524 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1525 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1526 const AlphaNum &g) {
1527 std::string result;
1528 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1529 f.size() + g.size());
1530 char *const begin = &*result.begin();
1531 char *out = Append4(begin, a, b, c, d);
1532 out = Append2(out, e, f);
1533 out = Append1(out, g);
1534 GOOGLE_DCHECK_EQ(out, begin + result.size());
1535 return result;
1536 }
1537
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1538 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1539 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1540 const AlphaNum &g, const AlphaNum &h) {
1541 std::string result;
1542 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1543 f.size() + g.size() + h.size());
1544 char *const begin = &*result.begin();
1545 char *out = Append4(begin, a, b, c, d);
1546 out = Append4(out, e, f, g, h);
1547 GOOGLE_DCHECK_EQ(out, begin + result.size());
1548 return result;
1549 }
1550
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1551 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1552 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1553 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1554 std::string result;
1555 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1556 f.size() + g.size() + h.size() + i.size());
1557 char *const begin = &*result.begin();
1558 char *out = Append4(begin, a, b, c, d);
1559 out = Append4(out, e, f, g, h);
1560 out = Append1(out, i);
1561 GOOGLE_DCHECK_EQ(out, begin + result.size());
1562 return result;
1563 }
1564
1565 // It's possible to call StrAppend with a char * pointer that is partway into
1566 // the string we're appending to. However the results of this are random.
1567 // Therefore, check for this in debug mode. Use unsigned math so we only have
1568 // to do one comparison.
1569 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1570 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1571 uintptr_t((dest).size()))
1572
StrAppend(std::string * result,const AlphaNum & a)1573 void StrAppend(std::string *result, const AlphaNum &a) {
1574 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1575 result->append(a.data(), a.size());
1576 }
1577
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b)1578 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) {
1579 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1580 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1581 std::string::size_type old_size = result->size();
1582 result->resize(old_size + a.size() + b.size());
1583 char *const begin = &*result->begin();
1584 char *out = Append2(begin + old_size, a, b);
1585 GOOGLE_DCHECK_EQ(out, begin + result->size());
1586 }
1587
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1588 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
1589 const AlphaNum &c) {
1590 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1591 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1592 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1593 std::string::size_type old_size = result->size();
1594 result->resize(old_size + a.size() + b.size() + c.size());
1595 char *const begin = &*result->begin();
1596 char *out = Append2(begin + old_size, a, b);
1597 out = Append1(out, c);
1598 GOOGLE_DCHECK_EQ(out, begin + result->size());
1599 }
1600
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1601 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
1602 const AlphaNum &c, const AlphaNum &d) {
1603 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1604 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1605 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1606 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1607 std::string::size_type old_size = result->size();
1608 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1609 char *const begin = &*result->begin();
1610 char *out = Append4(begin + old_size, a, b, c, d);
1611 GOOGLE_DCHECK_EQ(out, begin + result->size());
1612 }
1613
GlobalReplaceSubstring(const std::string & substring,const std::string & replacement,std::string * s)1614 int GlobalReplaceSubstring(const std::string &substring,
1615 const std::string &replacement, std::string *s) {
1616 GOOGLE_CHECK(s != nullptr);
1617 if (s->empty() || substring.empty())
1618 return 0;
1619 std::string tmp;
1620 int num_replacements = 0;
1621 int pos = 0;
1622 for (int match_pos = s->find(substring.data(), pos, substring.length());
1623 match_pos != std::string::npos; pos = match_pos + substring.length(),
1624 match_pos = s->find(substring.data(), pos, substring.length())) {
1625 ++num_replacements;
1626 // Append the original content before the match.
1627 tmp.append(*s, pos, match_pos - pos);
1628 // Append the replacement for the match.
1629 tmp.append(replacement.begin(), replacement.end());
1630 }
1631 // Append the content after the last match. If no replacements were made, the
1632 // original string is left untouched.
1633 if (num_replacements > 0) {
1634 tmp.append(*s, pos, s->length() - pos);
1635 s->swap(tmp);
1636 }
1637 return num_replacements;
1638 }
1639
CalculateBase64EscapedLen(int input_len,bool do_padding)1640 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1641 // Base64 encodes three bytes of input at a time. If the input is not
1642 // divisible by three, we pad as appropriate.
1643 //
1644 // (from http://tools.ietf.org/html/rfc3548)
1645 // Special processing is performed if fewer than 24 bits are available
1646 // at the end of the data being encoded. A full encoding quantum is
1647 // always completed at the end of a quantity. When fewer than 24 input
1648 // bits are available in an input group, zero bits are added (on the
1649 // right) to form an integral number of 6-bit groups. Padding at the
1650 // end of the data is performed using the '=' character. Since all base
1651 // 64 input is an integral number of octets, only the following cases
1652 // can arise:
1653
1654
1655 // Base64 encodes each three bytes of input into four bytes of output.
1656 int len = (input_len / 3) * 4;
1657
1658 if (input_len % 3 == 0) {
1659 // (from http://tools.ietf.org/html/rfc3548)
1660 // (1) the final quantum of encoding input is an integral multiple of 24
1661 // bits; here, the final unit of encoded output will be an integral
1662 // multiple of 4 characters with no "=" padding,
1663 } else if (input_len % 3 == 1) {
1664 // (from http://tools.ietf.org/html/rfc3548)
1665 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1666 // final unit of encoded output will be two characters followed by two
1667 // "=" padding characters, or
1668 len += 2;
1669 if (do_padding) {
1670 len += 2;
1671 }
1672 } else { // (input_len % 3 == 2)
1673 // (from http://tools.ietf.org/html/rfc3548)
1674 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1675 // final unit of encoded output will be three characters followed by one
1676 // "=" padding character.
1677 len += 3;
1678 if (do_padding) {
1679 len += 1;
1680 }
1681 }
1682
1683 assert(len >= input_len); // make sure we didn't overflow
1684 return len;
1685 }
1686
1687 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1688 int CalculateBase64EscapedLen(int input_len) {
1689 return CalculateBase64EscapedLen(input_len, true);
1690 }
1691
1692 // ----------------------------------------------------------------------
1693 // int Base64Unescape() - base64 decoder
1694 // int Base64Escape() - base64 encoder
1695 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1696 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1697 //
1698 // Check out
1699 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1700 // care about is that...
1701 // Take the encoded stuff in groups of 4 characters and turn each
1702 // character into a code 0 to 63 thus:
1703 // A-Z map to 0 to 25
1704 // a-z map to 26 to 51
1705 // 0-9 map to 52 to 61
1706 // +(- for WebSafe) maps to 62
1707 // /(_ for WebSafe) maps to 63
1708 // There will be four numbers, all less than 64 which can be represented
1709 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1710 // Arrange the 6 digit binary numbers into three bytes as such:
1711 // aaaaaabb bbbbcccc ccdddddd
1712 // Equals signs (one or two) are used at the end of the encoded block to
1713 // indicate that the text was not an integer multiple of three bytes long.
1714 // ----------------------------------------------------------------------
1715
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1716 int Base64UnescapeInternal(const char *src_param, int szsrc,
1717 char *dest, int szdest,
1718 const signed char* unbase64) {
1719 static const char kPad64Equals = '=';
1720 static const char kPad64Dot = '.';
1721
1722 int decode = 0;
1723 int destidx = 0;
1724 int state = 0;
1725 unsigned int ch = 0;
1726 unsigned int temp = 0;
1727
1728 // If "char" is signed by default, using *src as an array index results in
1729 // accessing negative array elements. Treat the input as a pointer to
1730 // unsigned char to avoid this.
1731 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1732
1733 // The GET_INPUT macro gets the next input character, skipping
1734 // over any whitespace, and stopping when we reach the end of the
1735 // string or when we read any non-data character. The arguments are
1736 // an arbitrary identifier (used as a label for goto) and the number
1737 // of data bytes that must remain in the input to avoid aborting the
1738 // loop.
1739 #define GET_INPUT(label, remain) \
1740 label: \
1741 --szsrc; \
1742 ch = *src++; \
1743 decode = unbase64[ch]; \
1744 if (decode < 0) { \
1745 if (ascii_isspace(ch) && szsrc >= remain) \
1746 goto label; \
1747 state = 4 - remain; \
1748 break; \
1749 }
1750
1751 // if dest is null, we're just checking to see if it's legal input
1752 // rather than producing output. (I suspect this could just be done
1753 // with a regexp...). We duplicate the loop so this test can be
1754 // outside it instead of in every iteration.
1755
1756 if (dest) {
1757 // This loop consumes 4 input bytes and produces 3 output bytes
1758 // per iteration. We can't know at the start that there is enough
1759 // data left in the string for a full iteration, so the loop may
1760 // break out in the middle; if so 'state' will be set to the
1761 // number of input bytes read.
1762
1763 while (szsrc >= 4) {
1764 // We'll start by optimistically assuming that the next four
1765 // bytes of the string (src[0..3]) are four good data bytes
1766 // (that is, no nulls, whitespace, padding chars, or illegal
1767 // chars). We need to test src[0..2] for nulls individually
1768 // before constructing temp to preserve the property that we
1769 // never read past a null in the string (no matter how long
1770 // szsrc claims the string is).
1771
1772 if (!src[0] || !src[1] || !src[2] ||
1773 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1774 (unsigned(unbase64[src[1]]) << 12) |
1775 (unsigned(unbase64[src[2]]) << 6) |
1776 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1777 // Iff any of those four characters was bad (null, illegal,
1778 // whitespace, padding), then temp's high bit will be set
1779 // (because unbase64[] is -1 for all bad characters).
1780 //
1781 // We'll back up and resort to the slower decoder, which knows
1782 // how to handle those cases.
1783
1784 GET_INPUT(first, 4);
1785 temp = decode;
1786 GET_INPUT(second, 3);
1787 temp = (temp << 6) | decode;
1788 GET_INPUT(third, 2);
1789 temp = (temp << 6) | decode;
1790 GET_INPUT(fourth, 1);
1791 temp = (temp << 6) | decode;
1792 } else {
1793 // We really did have four good data bytes, so advance four
1794 // characters in the string.
1795
1796 szsrc -= 4;
1797 src += 4;
1798 decode = -1;
1799 ch = '\0';
1800 }
1801
1802 // temp has 24 bits of input, so write that out as three bytes.
1803
1804 if (destidx+3 > szdest) return -1;
1805 dest[destidx+2] = temp;
1806 temp >>= 8;
1807 dest[destidx+1] = temp;
1808 temp >>= 8;
1809 dest[destidx] = temp;
1810 destidx += 3;
1811 }
1812 } else {
1813 while (szsrc >= 4) {
1814 if (!src[0] || !src[1] || !src[2] ||
1815 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1816 (unsigned(unbase64[src[1]]) << 12) |
1817 (unsigned(unbase64[src[2]]) << 6) |
1818 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1819 GET_INPUT(first_no_dest, 4);
1820 GET_INPUT(second_no_dest, 3);
1821 GET_INPUT(third_no_dest, 2);
1822 GET_INPUT(fourth_no_dest, 1);
1823 } else {
1824 szsrc -= 4;
1825 src += 4;
1826 decode = -1;
1827 ch = '\0';
1828 }
1829 destidx += 3;
1830 }
1831 }
1832
1833 #undef GET_INPUT
1834
1835 // if the loop terminated because we read a bad character, return
1836 // now.
1837 if (decode < 0 && ch != '\0' &&
1838 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1839 return -1;
1840
1841 if (ch == kPad64Equals || ch == kPad64Dot) {
1842 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1843 // look at it again when we count to check for the proper number of
1844 // equals signs at the end.
1845 ++szsrc;
1846 --src;
1847 } else {
1848 // This loop consumes 1 input byte per iteration. It's used to
1849 // clean up the 0-3 input bytes remaining when the first, faster
1850 // loop finishes. 'temp' contains the data from 'state' input
1851 // characters read by the first loop.
1852 while (szsrc > 0) {
1853 --szsrc;
1854 ch = *src++;
1855 decode = unbase64[ch];
1856 if (decode < 0) {
1857 if (ascii_isspace(ch)) {
1858 continue;
1859 } else if (ch == '\0') {
1860 break;
1861 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1862 // back up one character; we'll read it again when we check
1863 // for the correct number of pad characters at the end.
1864 ++szsrc;
1865 --src;
1866 break;
1867 } else {
1868 return -1;
1869 }
1870 }
1871
1872 // Each input character gives us six bits of output.
1873 temp = (temp << 6) | decode;
1874 ++state;
1875 if (state == 4) {
1876 // If we've accumulated 24 bits of output, write that out as
1877 // three bytes.
1878 if (dest) {
1879 if (destidx+3 > szdest) return -1;
1880 dest[destidx+2] = temp;
1881 temp >>= 8;
1882 dest[destidx+1] = temp;
1883 temp >>= 8;
1884 dest[destidx] = temp;
1885 }
1886 destidx += 3;
1887 state = 0;
1888 temp = 0;
1889 }
1890 }
1891 }
1892
1893 // Process the leftover data contained in 'temp' at the end of the input.
1894 int expected_equals = 0;
1895 switch (state) {
1896 case 0:
1897 // Nothing left over; output is a multiple of 3 bytes.
1898 break;
1899
1900 case 1:
1901 // Bad input; we have 6 bits left over.
1902 return -1;
1903
1904 case 2:
1905 // Produce one more output byte from the 12 input bits we have left.
1906 if (dest) {
1907 if (destidx+1 > szdest) return -1;
1908 temp >>= 4;
1909 dest[destidx] = temp;
1910 }
1911 ++destidx;
1912 expected_equals = 2;
1913 break;
1914
1915 case 3:
1916 // Produce two more output bytes from the 18 input bits we have left.
1917 if (dest) {
1918 if (destidx+2 > szdest) return -1;
1919 temp >>= 2;
1920 dest[destidx+1] = temp;
1921 temp >>= 8;
1922 dest[destidx] = temp;
1923 }
1924 destidx += 2;
1925 expected_equals = 1;
1926 break;
1927
1928 default:
1929 // state should have no other values at this point.
1930 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1931 }
1932
1933 // The remainder of the string should be all whitespace, mixed with
1934 // exactly 0 equals signs, or exactly 'expected_equals' equals
1935 // signs. (Always accepting 0 equals signs is a google extension
1936 // not covered in the RFC, as is accepting dot as the pad character.)
1937
1938 int equals = 0;
1939 while (szsrc > 0 && *src) {
1940 if (*src == kPad64Equals || *src == kPad64Dot)
1941 ++equals;
1942 else if (!ascii_isspace(*src))
1943 return -1;
1944 --szsrc;
1945 ++src;
1946 }
1947
1948 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1949 }
1950
1951 // The arrays below were generated by the following code
1952 // #include <sys/time.h>
1953 // #include <stdlib.h>
1954 // #include <string.h>
1955 // #include <stdio.h>
1956 // main()
1957 // {
1958 // static const char Base64[] =
1959 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1960 // const char *pos;
1961 // int idx, i, j;
1962 // printf(" ");
1963 // for (i = 0; i < 255; i += 8) {
1964 // for (j = i; j < i + 8; j++) {
1965 // pos = strchr(Base64, j);
1966 // if ((pos == nullptr) || (j == 0))
1967 // idx = -1;
1968 // else
1969 // idx = pos - Base64;
1970 // if (idx == -1)
1971 // printf(" %2d, ", idx);
1972 // else
1973 // printf(" %2d/""*%c*""/,", idx, j);
1974 // }
1975 // printf("\n ");
1976 // }
1977 // }
1978 //
1979 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1980 // tables from the functions below.
1981 static const signed char kUnBase64[] = {
1982 -1, -1, -1, -1, -1, -1, -1, -1,
1983 -1, -1, -1, -1, -1, -1, -1, -1,
1984 -1, -1, -1, -1, -1, -1, -1, -1,
1985 -1, -1, -1, -1, -1, -1, -1, -1,
1986 -1, -1, -1, -1, -1, -1, -1, -1,
1987 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1988 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1989 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
1990 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
1991 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1992 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1993 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
1994 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1995 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1996 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1997 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
1998 -1, -1, -1, -1, -1, -1, -1, -1,
1999 -1, -1, -1, -1, -1, -1, -1, -1,
2000 -1, -1, -1, -1, -1, -1, -1, -1,
2001 -1, -1, -1, -1, -1, -1, -1, -1,
2002 -1, -1, -1, -1, -1, -1, -1, -1,
2003 -1, -1, -1, -1, -1, -1, -1, -1,
2004 -1, -1, -1, -1, -1, -1, -1, -1,
2005 -1, -1, -1, -1, -1, -1, -1, -1,
2006 -1, -1, -1, -1, -1, -1, -1, -1,
2007 -1, -1, -1, -1, -1, -1, -1, -1,
2008 -1, -1, -1, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1,
2011 -1, -1, -1, -1, -1, -1, -1, -1,
2012 -1, -1, -1, -1, -1, -1, -1, -1,
2013 -1, -1, -1, -1, -1, -1, -1, -1
2014 };
2015 static const signed char kUnWebSafeBase64[] = {
2016 -1, -1, -1, -1, -1, -1, -1, -1,
2017 -1, -1, -1, -1, -1, -1, -1, -1,
2018 -1, -1, -1, -1, -1, -1, -1, -1,
2019 -1, -1, -1, -1, -1, -1, -1, -1,
2020 -1, -1, -1, -1, -1, -1, -1, -1,
2021 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2022 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2023 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2024 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2025 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2026 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2027 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2028 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2029 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2030 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2031 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2032 -1, -1, -1, -1, -1, -1, -1, -1,
2033 -1, -1, -1, -1, -1, -1, -1, -1,
2034 -1, -1, -1, -1, -1, -1, -1, -1,
2035 -1, -1, -1, -1, -1, -1, -1, -1,
2036 -1, -1, -1, -1, -1, -1, -1, -1,
2037 -1, -1, -1, -1, -1, -1, -1, -1,
2038 -1, -1, -1, -1, -1, -1, -1, -1,
2039 -1, -1, -1, -1, -1, -1, -1, -1,
2040 -1, -1, -1, -1, -1, -1, -1, -1,
2041 -1, -1, -1, -1, -1, -1, -1, -1,
2042 -1, -1, -1, -1, -1, -1, -1, -1,
2043 -1, -1, -1, -1, -1, -1, -1, -1,
2044 -1, -1, -1, -1, -1, -1, -1, -1,
2045 -1, -1, -1, -1, -1, -1, -1, -1,
2046 -1, -1, -1, -1, -1, -1, -1, -1,
2047 -1, -1, -1, -1, -1, -1, -1, -1
2048 };
2049
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2050 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2051 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2052 }
2053
Base64UnescapeInternal(const char * src,int slen,std::string * dest,const signed char * unbase64)2054 static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest,
2055 const signed char *unbase64) {
2056 // Determine the size of the output string. Base64 encodes every 3 bytes into
2057 // 4 characters. any leftover chars are added directly for good measure.
2058 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2059 const int dest_len = 3 * (slen / 4) + (slen % 4);
2060
2061 dest->resize(dest_len);
2062
2063 // We are getting the destination buffer by getting the beginning of the
2064 // string and converting it into a char *.
2065 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2066 dest_len, unbase64);
2067 if (len < 0) {
2068 dest->clear();
2069 return false;
2070 }
2071
2072 // could be shorter if there was padding
2073 GOOGLE_DCHECK_LE(len, dest_len);
2074 dest->erase(len);
2075
2076 return true;
2077 }
2078
Base64Unescape(StringPiece src,std::string * dest)2079 bool Base64Unescape(StringPiece src, std::string *dest) {
2080 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2081 }
2082
WebSafeBase64Unescape(StringPiece src,std::string * dest)2083 bool WebSafeBase64Unescape(StringPiece src, std::string *dest) {
2084 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2085 }
2086
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2087 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2088 char *dest, int szdest, const char *base64,
2089 bool do_padding) {
2090 static const char kPad64 = '=';
2091
2092 if (szsrc <= 0) return 0;
2093
2094 if (szsrc * 4 > szdest * 3) return 0;
2095
2096 char *cur_dest = dest;
2097 const unsigned char *cur_src = src;
2098
2099 char *limit_dest = dest + szdest;
2100 const unsigned char *limit_src = src + szsrc;
2101
2102 // Three bytes of data encodes to four characters of cyphertext.
2103 // So we can pump through three-byte chunks atomically.
2104 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2105 uint32 in = BigEndian::Load32(cur_src) >> 8;
2106
2107 cur_dest[0] = base64[in >> 18];
2108 in &= 0x3FFFF;
2109 cur_dest[1] = base64[in >> 12];
2110 in &= 0xFFF;
2111 cur_dest[2] = base64[in >> 6];
2112 in &= 0x3F;
2113 cur_dest[3] = base64[in];
2114
2115 cur_dest += 4;
2116 cur_src += 3;
2117 }
2118 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2119 szdest = limit_dest - cur_dest;
2120 szsrc = limit_src - cur_src;
2121
2122 /* now deal with the tail (<=3 bytes) */
2123 switch (szsrc) {
2124 case 0:
2125 // Nothing left; nothing more to do.
2126 break;
2127 case 1: {
2128 // One byte left: this encodes to two characters, and (optionally)
2129 // two pad characters to round out the four-character cypherblock.
2130 if ((szdest -= 2) < 0) return 0;
2131 uint32 in = cur_src[0];
2132 cur_dest[0] = base64[in >> 2];
2133 in &= 0x3;
2134 cur_dest[1] = base64[in << 4];
2135 cur_dest += 2;
2136 if (do_padding) {
2137 if ((szdest -= 2) < 0) return 0;
2138 cur_dest[0] = kPad64;
2139 cur_dest[1] = kPad64;
2140 cur_dest += 2;
2141 }
2142 break;
2143 }
2144 case 2: {
2145 // Two bytes left: this encodes to three characters, and (optionally)
2146 // one pad character to round out the four-character cypherblock.
2147 if ((szdest -= 3) < 0) return 0;
2148 uint32 in = BigEndian::Load16(cur_src);
2149 cur_dest[0] = base64[in >> 10];
2150 in &= 0x3FF;
2151 cur_dest[1] = base64[in >> 4];
2152 in &= 0x00F;
2153 cur_dest[2] = base64[in << 2];
2154 cur_dest += 3;
2155 if (do_padding) {
2156 if ((szdest -= 1) < 0) return 0;
2157 cur_dest[0] = kPad64;
2158 cur_dest += 1;
2159 }
2160 break;
2161 }
2162 case 3: {
2163 // Three bytes left: same as in the big loop above. We can't do this in
2164 // the loop because the loop above always reads 4 bytes, and the fourth
2165 // byte is past the end of the input.
2166 if ((szdest -= 4) < 0) return 0;
2167 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2168 cur_dest[0] = base64[in >> 18];
2169 in &= 0x3FFFF;
2170 cur_dest[1] = base64[in >> 12];
2171 in &= 0xFFF;
2172 cur_dest[2] = base64[in >> 6];
2173 in &= 0x3F;
2174 cur_dest[3] = base64[in];
2175 cur_dest += 4;
2176 break;
2177 }
2178 default:
2179 // Should not be reached: blocks of 4 bytes are handled
2180 // in the while loop before this switch statement.
2181 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2182 break;
2183 }
2184 return (cur_dest - dest);
2185 }
2186
2187 static const char kBase64Chars[] =
2188 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2189
2190 static const char kWebSafeBase64Chars[] =
2191 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2192
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2193 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2194 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2195 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2196 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2197 int szdest, bool do_padding) {
2198 return Base64EscapeInternal(src, szsrc, dest, szdest,
2199 kWebSafeBase64Chars, do_padding);
2200 }
2201
Base64EscapeInternal(const unsigned char * src,int szsrc,std::string * dest,bool do_padding,const char * base64_chars)2202 void Base64EscapeInternal(const unsigned char *src, int szsrc,
2203 std::string *dest, bool do_padding,
2204 const char *base64_chars) {
2205 const int calc_escaped_size =
2206 CalculateBase64EscapedLen(szsrc, do_padding);
2207 dest->resize(calc_escaped_size);
2208 const int escaped_len = Base64EscapeInternal(src, szsrc,
2209 string_as_array(dest),
2210 dest->size(),
2211 base64_chars,
2212 do_padding);
2213 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2214 dest->erase(escaped_len);
2215 }
2216
Base64Escape(const unsigned char * src,int szsrc,std::string * dest,bool do_padding)2217 void Base64Escape(const unsigned char *src, int szsrc, std::string *dest,
2218 bool do_padding) {
2219 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2220 }
2221
WebSafeBase64Escape(const unsigned char * src,int szsrc,std::string * dest,bool do_padding)2222 void WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest,
2223 bool do_padding) {
2224 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2225 }
2226
Base64Escape(StringPiece src,std::string * dest)2227 void Base64Escape(StringPiece src, std::string *dest) {
2228 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2229 src.size(), dest, true);
2230 }
2231
WebSafeBase64Escape(StringPiece src,std::string * dest)2232 void WebSafeBase64Escape(StringPiece src, std::string *dest) {
2233 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2234 src.size(), dest, false);
2235 }
2236
WebSafeBase64EscapeWithPadding(StringPiece src,std::string * dest)2237 void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) {
2238 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2239 src.size(), dest, true);
2240 }
2241
2242 // Helper to append a Unicode code point to a string as UTF8, without bringing
2243 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2244 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2245 uint32 tmp = 0;
2246 int len = 0;
2247 if (code_point <= 0x7f) {
2248 tmp = code_point;
2249 len = 1;
2250 } else if (code_point <= 0x07ff) {
2251 tmp = 0x0000c080 |
2252 ((code_point & 0x07c0) << 2) |
2253 (code_point & 0x003f);
2254 len = 2;
2255 } else if (code_point <= 0xffff) {
2256 tmp = 0x00e08080 |
2257 ((code_point & 0xf000) << 4) |
2258 ((code_point & 0x0fc0) << 2) |
2259 (code_point & 0x003f);
2260 len = 3;
2261 } else {
2262 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2263 // normally only defined up to there as well.
2264 tmp = 0xf0808080 |
2265 ((code_point & 0x1c0000) << 6) |
2266 ((code_point & 0x03f000) << 4) |
2267 ((code_point & 0x000fc0) << 2) |
2268 (code_point & 0x003f);
2269 len = 4;
2270 }
2271 tmp = ghtonl(tmp);
2272 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2273 return len;
2274 }
2275
2276 // Table of UTF-8 character lengths, based on first byte
2277 static const unsigned char kUTF8LenTbl[256] = {
2278 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2279 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2280 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2281 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2282 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2283 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2284
2285 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2286 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2287 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2288 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2289 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2290 3, 3, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2291
2292 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2293 int UTF8FirstLetterNumBytes(const char* src, int len) {
2294 if (len == 0) {
2295 return 0;
2296 }
2297 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2298 }
2299
2300 // ----------------------------------------------------------------------
2301 // CleanStringLineEndings()
2302 // Clean up a multi-line string to conform to Unix line endings.
2303 // Reads from src and appends to dst, so usually dst should be empty.
2304 //
2305 // If there is no line ending at the end of a non-empty string, it can
2306 // be added automatically.
2307 //
2308 // Four different types of input are correctly handled:
2309 //
2310 // - Unix/Linux files: line ending is LF: pass through unchanged
2311 //
2312 // - DOS/Windows files: line ending is CRLF: convert to LF
2313 //
2314 // - Legacy Mac files: line ending is CR: convert to LF
2315 //
2316 // - Garbled files: random line endings: convert gracefully
2317 // lonely CR, lonely LF, CRLF: convert to LF
2318 //
2319 // @param src The multi-line string to convert
2320 // @param dst The converted string is appended to this string
2321 // @param auto_end_last_line Automatically terminate the last line
2322 //
2323 // Limitations:
2324 //
2325 // This does not do the right thing for CRCRLF files created by
2326 // broken programs that do another Unix->DOS conversion on files
2327 // that are already in CRLF format. For this, a two-pass approach
2328 // brute-force would be needed that
2329 //
2330 // (1) determines the presence of LF (first one is ok)
2331 // (2) if yes, removes any CR, else convert every CR to LF
2332
CleanStringLineEndings(const std::string & src,std::string * dst,bool auto_end_last_line)2333 void CleanStringLineEndings(const std::string &src, std::string *dst,
2334 bool auto_end_last_line) {
2335 if (dst->empty()) {
2336 dst->append(src);
2337 CleanStringLineEndings(dst, auto_end_last_line);
2338 } else {
2339 std::string tmp = src;
2340 CleanStringLineEndings(&tmp, auto_end_last_line);
2341 dst->append(tmp);
2342 }
2343 }
2344
CleanStringLineEndings(std::string * str,bool auto_end_last_line)2345 void CleanStringLineEndings(std::string *str, bool auto_end_last_line) {
2346 ptrdiff_t output_pos = 0;
2347 bool r_seen = false;
2348 ptrdiff_t len = str->size();
2349
2350 char *p = &(*str)[0];
2351
2352 for (ptrdiff_t input_pos = 0; input_pos < len;) {
2353 if (!r_seen && input_pos + 8 < len) {
2354 uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2355 // Loop over groups of 8 bytes at a time until we come across
2356 // a word that has a byte whose value is less than or equal to
2357 // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2358 //
2359 // We use a has_less macro that quickly tests a whole 64-bit
2360 // word to see if any of the bytes has a value < N.
2361 //
2362 // For more details, see:
2363 // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2364 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2365 if (!has_less(v, '\r' + 1)) {
2366 #undef has_less
2367 // No byte in this word has a value that could be a \r or a \n
2368 if (output_pos != input_pos) {
2369 GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2370 }
2371 input_pos += 8;
2372 output_pos += 8;
2373 continue;
2374 }
2375 }
2376 std::string::const_reference in = p[input_pos];
2377 if (in == '\r') {
2378 if (r_seen) p[output_pos++] = '\n';
2379 r_seen = true;
2380 } else if (in == '\n') {
2381 if (input_pos != output_pos)
2382 p[output_pos++] = '\n';
2383 else
2384 output_pos++;
2385 r_seen = false;
2386 } else {
2387 if (r_seen) p[output_pos++] = '\n';
2388 r_seen = false;
2389 if (input_pos != output_pos)
2390 p[output_pos++] = in;
2391 else
2392 output_pos++;
2393 }
2394 input_pos++;
2395 }
2396 if (r_seen ||
2397 (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2398 str->resize(output_pos + 1);
2399 str->operator[](output_pos) = '\n';
2400 } else if (output_pos < len) {
2401 str->resize(output_pos);
2402 }
2403 }
2404
2405 namespace internal {
2406
2407 // ----------------------------------------------------------------------
2408 // NoLocaleStrtod()
2409 // This code will make you cry.
2410 // ----------------------------------------------------------------------
2411
2412 namespace {
2413
2414 // Returns a string identical to *input except that the character pointed to
2415 // by radix_pos (which should be '.') is replaced with the locale-specific
2416 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2417 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2418 // Determine the locale-specific radix character by calling sprintf() to
2419 // print the number 1.5, then stripping off the digits. As far as I can
2420 // tell, this is the only portable, thread-safe way to get the C library
2421 // to divuldge the locale's radix character. No, localeconv() is NOT
2422 // thread-safe.
2423 char temp[16];
2424 int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2425 GOOGLE_CHECK_EQ(temp[0], '1');
2426 GOOGLE_CHECK_EQ(temp[size - 1], '5');
2427 GOOGLE_CHECK_LE(size, 6);
2428
2429 // Now replace the '.' in the input with it.
2430 std::string result;
2431 result.reserve(strlen(input) + size - 3);
2432 result.append(input, radix_pos);
2433 result.append(temp + 1, size - 2);
2434 result.append(radix_pos + 1);
2435 return result;
2436 }
2437
2438 } // namespace
2439
NoLocaleStrtod(const char * str,char ** endptr)2440 double NoLocaleStrtod(const char *str, char **endptr) {
2441 // We cannot simply set the locale to "C" temporarily with setlocale()
2442 // as this is not thread-safe. Instead, we try to parse in the current
2443 // locale first. If parsing stops at a '.' character, then this is a
2444 // pretty good hint that we're actually in some other locale in which
2445 // '.' is not the radix character.
2446
2447 char *temp_endptr;
2448 double result = strtod(str, &temp_endptr);
2449 if (endptr != NULL) *endptr = temp_endptr;
2450 if (*temp_endptr != '.') return result;
2451
2452 // Parsing halted on a '.'. Perhaps we're in a different locale? Let's
2453 // try to replace the '.' with a locale-specific radix character and
2454 // try again.
2455 std::string localized = LocalizeRadix(str, temp_endptr);
2456 const char *localized_cstr = localized.c_str();
2457 char *localized_endptr;
2458 result = strtod(localized_cstr, &localized_endptr);
2459 if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2460 // This attempt got further, so replacing the decimal must have helped.
2461 // Update endptr to point at the right location.
2462 if (endptr != NULL) {
2463 // size_diff is non-zero if the localized radix has multiple bytes.
2464 int size_diff = localized.size() - strlen(str);
2465 // const_cast is necessary to match the strtod() interface.
2466 *endptr = const_cast<char *>(
2467 str + (localized_endptr - localized_cstr - size_diff));
2468 }
2469 }
2470
2471 return result;
2472 }
2473
2474 } // namespace internal
2475
2476 } // namespace protobuf
2477 } // namespace google
2478