• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 
35 #include <errno.h>
36 #include <float.h>    // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42 
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45 
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions.  The former is buggy.  When invoked like so:
51 //   char buffer[32];
52 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
54 // trailing zeros after the decimal point.  For some reason this bug only
55 // occurs with some input values, not all.  In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59 
60 namespace google {
61 namespace protobuf {
62 
63 // These are defined as macros on some platforms.  #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67 
68 // The definitions of these in ctype.h change based on locale.  Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale.  So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73   return ('0' <= c && c <= '9') ||
74          ('a' <= c && c <= 'f') ||
75          ('A' <= c && c <= 'F');
76 }
77 
isprint(char c)78 inline bool isprint(char c) {
79   return c >= 0x20 && c <= 0x7E;
80 }
81 
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 //    Replaces any occurrence of the character 'remove' (or the characters
85 //    in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(std::string * s,const char * remove,char replacewith)87 void ReplaceCharacters(std::string *s, const char *remove, char replacewith) {
88   const char *str_start = s->c_str();
89   const char *str = str_start;
90   for (str = strpbrk(str, remove);
91        str != nullptr;
92        str = strpbrk(str + 1, remove)) {
93     (*s)[str - str_start] = replacewith;
94   }
95 }
96 
StripWhitespace(std::string * str)97 void StripWhitespace(std::string *str) {
98   int str_length = str->length();
99 
100   // Strip off leading whitespace.
101   int first = 0;
102   while (first < str_length && ascii_isspace(str->at(first))) {
103     ++first;
104   }
105   // If entire string is white space.
106   if (first == str_length) {
107     str->clear();
108     return;
109   }
110   if (first > 0) {
111     str->erase(0, first);
112     str_length -= first;
113   }
114 
115   // Strip off trailing whitespace.
116   int last = str_length - 1;
117   while (last >= 0 && ascii_isspace(str->at(last))) {
118     --last;
119   }
120   if (last != (str_length - 1) && last >= 0) {
121     str->erase(last + 1, std::string::npos);
122   }
123 }
124 
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 //    Replace the "old" pattern with the "new" pattern in a string,
128 //    and append the result to "res".  If replace_all is false,
129 //    it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131 
StringReplace(const std::string & s,const std::string & oldsub,const std::string & newsub,bool replace_all,std::string * res)132 void StringReplace(const std::string &s, const std::string &oldsub,
133                    const std::string &newsub, bool replace_all,
134                    std::string *res) {
135   if (oldsub.empty()) {
136     res->append(s);  // if empty, append the given string.
137     return;
138   }
139 
140   std::string::size_type start_pos = 0;
141   std::string::size_type pos;
142   do {
143     pos = s.find(oldsub, start_pos);
144     if (pos == std::string::npos) {
145       break;
146     }
147     res->append(s, start_pos, pos - start_pos);
148     res->append(newsub);
149     start_pos = pos + oldsub.size();  // start searching again after the "old"
150   } while (replace_all);
151   res->append(s, start_pos, s.length() - start_pos);
152 }
153 
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 //    Give me a string and two patterns "old" and "new", and I replace
157 //    the first instance of "old" in the string with "new", if it
158 //    exists.  If "global" is true; call this repeatedly until it
159 //    fails.  RETURN a new string, regardless of whether the replacement
160 //    happened or not.
161 // ----------------------------------------------------------------------
162 
StringReplace(const std::string & s,const std::string & oldsub,const std::string & newsub,bool replace_all)163 std::string StringReplace(const std::string &s, const std::string &oldsub,
164                           const std::string &newsub, bool replace_all) {
165   std::string ret;
166   StringReplace(s, oldsub, newsub, replace_all, &ret);
167   return ret;
168 }
169 
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 //    Split a string using a character delimiter. Append the components
173 //    to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
SplitStringToIteratorUsing(StringPiece full,const char * delim,ITR & result)179 static inline void SplitStringToIteratorUsing(StringPiece full,
180                                               const char *delim, ITR &result) {
181   // Optimize the common case where delim is a single character.
182   if (delim[0] != '\0' && delim[1] == '\0') {
183     char c = delim[0];
184     const char* p = full.data();
185     const char* end = p + full.size();
186     while (p != end) {
187       if (*p == c) {
188         ++p;
189       } else {
190         const char* start = p;
191         while (++p != end && *p != c);
192         *result++ = std::string(start, p - start);
193       }
194     }
195     return;
196   }
197 
198   std::string::size_type begin_index, end_index;
199   begin_index = full.find_first_not_of(delim);
200   while (begin_index != std::string::npos) {
201     end_index = full.find_first_of(delim, begin_index);
202     if (end_index == std::string::npos) {
203       *result++ = std::string(full.substr(begin_index));
204       return;
205     }
206     *result++ =
207         std::string(full.substr(begin_index, (end_index - begin_index)));
208     begin_index = full.find_first_not_of(delim, end_index);
209   }
210 }
211 
SplitStringUsing(StringPiece full,const char * delim,std::vector<std::string> * result)212 void SplitStringUsing(StringPiece full, const char *delim,
213                       std::vector<std::string> *result) {
214   std::back_insert_iterator<std::vector<std::string> > it(*result);
215   SplitStringToIteratorUsing(full, delim, it);
216 }
217 
218 // Split a string using a character delimiter. Append the components
219 // to 'result'.  If there are consecutive delimiters, this function
220 // will return corresponding empty strings. The string is split into
221 // at most the specified number of pieces greedily. This means that the
222 // last piece may possibly be split further. To split into as many pieces
223 // as possible, specify 0 as the number of pieces.
224 //
225 // If "full" is the empty string, yields an empty string as the only value.
226 //
227 // If "pieces" is negative for some reason, it returns the whole string
228 // ----------------------------------------------------------------------
229 template <typename ITR>
SplitStringToIteratorAllowEmpty(StringPiece full,const char * delim,int pieces,ITR & result)230 static inline void SplitStringToIteratorAllowEmpty(StringPiece full,
231                                                    const char *delim,
232                                                    int pieces, ITR &result) {
233   std::string::size_type begin_index, end_index;
234   begin_index = 0;
235 
236   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
237     end_index = full.find_first_of(delim, begin_index);
238     if (end_index == std::string::npos) {
239       *result++ = std::string(full.substr(begin_index));
240       return;
241     }
242     *result++ =
243         std::string(full.substr(begin_index, (end_index - begin_index)));
244     begin_index = end_index + 1;
245   }
246   *result++ = std::string(full.substr(begin_index));
247 }
248 
SplitStringAllowEmpty(StringPiece full,const char * delim,std::vector<std::string> * result)249 void SplitStringAllowEmpty(StringPiece full, const char *delim,
250                            std::vector<std::string> *result) {
251   std::back_insert_iterator<std::vector<std::string> > it(*result);
252   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
253 }
254 
255 // ----------------------------------------------------------------------
256 // JoinStrings()
257 //    This merges a vector of string components with delim inserted
258 //    as separaters between components.
259 //
260 // ----------------------------------------------------------------------
261 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,std::string * result)262 static void JoinStringsIterator(const ITERATOR &start, const ITERATOR &end,
263                                 const char *delim, std::string *result) {
264   GOOGLE_CHECK(result != nullptr);
265   result->clear();
266   int delim_length = strlen(delim);
267 
268   // Precompute resulting length so we can reserve() memory in one shot.
269   int length = 0;
270   for (ITERATOR iter = start; iter != end; ++iter) {
271     if (iter != start) {
272       length += delim_length;
273     }
274     length += iter->size();
275   }
276   result->reserve(length);
277 
278   // Now combine everything.
279   for (ITERATOR iter = start; iter != end; ++iter) {
280     if (iter != start) {
281       result->append(delim, delim_length);
282     }
283     result->append(iter->data(), iter->size());
284   }
285 }
286 
JoinStrings(const std::vector<std::string> & components,const char * delim,std::string * result)287 void JoinStrings(const std::vector<std::string> &components, const char *delim,
288                  std::string *result) {
289   JoinStringsIterator(components.begin(), components.end(), delim, result);
290 }
291 
292 // ----------------------------------------------------------------------
293 // UnescapeCEscapeSequences()
294 //    This does all the unescaping that C does: \ooo, \r, \n, etc
295 //    Returns length of resulting string.
296 //    The implementation of \x parses any positive number of hex digits,
297 //    but it is an error if the value requires more than 8 bits, and the
298 //    result is truncated to 8 bits.
299 //
300 //    The second call stores its errors in a supplied string vector.
301 //    If the string vector pointer is nullptr, it reports the errors with LOG().
302 // ----------------------------------------------------------------------
303 
304 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
305 
306 // Protocol buffers doesn't ever care about errors, but I don't want to remove
307 // the code.
308 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
309 
UnescapeCEscapeSequences(const char * source,char * dest)310 int UnescapeCEscapeSequences(const char* source, char* dest) {
311   return UnescapeCEscapeSequences(source, dest, nullptr);
312 }
313 
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<std::string> * errors)314 int UnescapeCEscapeSequences(const char *source, char *dest,
315                              std::vector<std::string> *errors) {
316   GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
317 
318   char* d = dest;
319   const char* p = source;
320 
321   // Small optimization for case where source = dest and there's no escaping
322   while ( p == d && *p != '\0' && *p != '\\' )
323     p++, d++;
324 
325   while (*p != '\0') {
326     if (*p != '\\') {
327       *d++ = *p++;
328     } else {
329       switch ( *++p ) {                    // skip past the '\\'
330         case '\0':
331           LOG_STRING(ERROR, errors) << "String cannot end with \\";
332           *d = '\0';
333           return d - dest;   // we're done with p
334         case 'a':  *d++ = '\a';  break;
335         case 'b':  *d++ = '\b';  break;
336         case 'f':  *d++ = '\f';  break;
337         case 'n':  *d++ = '\n';  break;
338         case 'r':  *d++ = '\r';  break;
339         case 't':  *d++ = '\t';  break;
340         case 'v':  *d++ = '\v';  break;
341         case '\\': *d++ = '\\';  break;
342         case '?':  *d++ = '\?';  break;    // \?  Who knew?
343         case '\'': *d++ = '\'';  break;
344         case '"':  *d++ = '\"';  break;
345         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
346         case '4': case '5': case '6': case '7': {
347           char ch = *p - '0';
348           if ( IS_OCTAL_DIGIT(p[1]) )
349             ch = ch * 8 + *++p - '0';
350           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
351             ch = ch * 8 + *++p - '0';      // now points at last digit
352           *d++ = ch;
353           break;
354         }
355         case 'x': case 'X': {
356           if (!isxdigit(p[1])) {
357             if (p[1] == '\0') {
358               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
359             } else {
360               LOG_STRING(ERROR, errors) <<
361                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
362             }
363             break;
364           }
365           unsigned int ch = 0;
366           const char *hex_start = p;
367           while (isxdigit(p[1]))  // arbitrarily many hex digits
368             ch = (ch << 4) + hex_digit_to_int(*++p);
369           if (ch > 0xFF)
370             LOG_STRING(ERROR, errors)
371                 << "Value of "
372                 << "\\" << std::string(hex_start, p + 1 - hex_start)
373                 << " exceeds 8 bits";
374           *d++ = ch;
375           break;
376         }
377 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
378         case 'u': {
379           // \uhhhh => convert 4 hex digits to UTF-8
380           char32 rune = 0;
381           const char *hex_start = p;
382           for (int i = 0; i < 4; ++i) {
383             if (isxdigit(p[1])) {  // Look one char ahead.
384               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
385             } else {
386               LOG_STRING(ERROR, errors)
387                 << "\\u must be followed by 4 hex digits: \\"
388                 <<  std::string(hex_start, p+1-hex_start);
389               break;
390             }
391           }
392           d += runetochar(d, &rune);
393           break;
394         }
395         case 'U': {
396           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
397           char32 rune = 0;
398           const char *hex_start = p;
399           for (int i = 0; i < 8; ++i) {
400             if (isxdigit(p[1])) {  // Look one char ahead.
401               // Don't change rune until we're sure this
402               // is within the Unicode limit, but do advance p.
403               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
404               if (newrune > 0x10FFFF) {
405                 LOG_STRING(ERROR, errors)
406                   << "Value of \\"
407                   << std::string(hex_start, p + 1 - hex_start)
408                   << " exceeds Unicode limit (0x10FFFF)";
409                 break;
410               } else {
411                 rune = newrune;
412               }
413             } else {
414               LOG_STRING(ERROR, errors)
415                 << "\\U must be followed by 8 hex digits: \\"
416                 <<  std::string(hex_start, p+1-hex_start);
417               break;
418             }
419           }
420           d += runetochar(d, &rune);
421           break;
422         }
423 #endif
424         default:
425           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
426       }
427       p++;                                 // read past letter we escaped
428     }
429   }
430   *d = '\0';
431   return d - dest;
432 }
433 
434 // ----------------------------------------------------------------------
435 // UnescapeCEscapeString()
436 //    This does the same thing as UnescapeCEscapeSequences, but creates
437 //    a new string. The caller does not need to worry about allocating
438 //    a dest buffer. This should be used for non performance critical
439 //    tasks such as printing debug messages. It is safe for src and dest
440 //    to be the same.
441 //
442 //    The second call stores its errors in a supplied string vector.
443 //    If the string vector pointer is nullptr, it reports the errors with LOG().
444 //
445 //    In the first and second calls, the length of dest is returned. In the
446 //    the third call, the new string is returned.
447 // ----------------------------------------------------------------------
UnescapeCEscapeString(const std::string & src,std::string * dest)448 int UnescapeCEscapeString(const std::string &src, std::string *dest) {
449   return UnescapeCEscapeString(src, dest, nullptr);
450 }
451 
UnescapeCEscapeString(const std::string & src,std::string * dest,std::vector<std::string> * errors)452 int UnescapeCEscapeString(const std::string &src, std::string *dest,
453                           std::vector<std::string> *errors) {
454   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
455   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
456   GOOGLE_CHECK(dest);
457   dest->assign(unescaped.get(), len);
458   return len;
459 }
460 
UnescapeCEscapeString(const std::string & src)461 std::string UnescapeCEscapeString(const std::string &src) {
462   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
463   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
464   return std::string(unescaped.get(), len);
465 }
466 
467 // ----------------------------------------------------------------------
468 // CEscapeString()
469 // CHexEscapeString()
470 //    Copies 'src' to 'dest', escaping dangerous characters using
471 //    C-style escape sequences. This is very useful for preparing query
472 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
473 //    hexadecimal rather than octal sequences.
474 //    Returns the number of bytes written to 'dest' (not including the \0)
475 //    or -1 if there was insufficient space.
476 //
477 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
478 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)479 int CEscapeInternal(const char* src, int src_len, char* dest,
480                     int dest_len, bool use_hex, bool utf8_safe) {
481   const char* src_end = src + src_len;
482   int used = 0;
483   bool last_hex_escape = false; // true if last output char was \xNN
484 
485   for (; src < src_end; src++) {
486     if (dest_len - used < 2)   // Need space for two letter escape
487       return -1;
488 
489     bool is_hex_escape = false;
490     switch (*src) {
491       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
492       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
493       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
494       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
495       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
496       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
497       default:
498         // Note that if we emit \xNN and the src character after that is a hex
499         // digit then that digit must be escaped too to prevent it being
500         // interpreted as part of the character code by C.
501         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
502             (!isprint(*src) ||
503              (last_hex_escape && isxdigit(*src)))) {
504           if (dest_len - used < 4) // need space for 4 letter escape
505             return -1;
506           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
507                   static_cast<uint8>(*src));
508           is_hex_escape = use_hex;
509           used += 4;
510         } else {
511           dest[used++] = *src; break;
512         }
513     }
514     last_hex_escape = is_hex_escape;
515   }
516 
517   if (dest_len - used < 1)   // make sure that there is room for \0
518     return -1;
519 
520   dest[used] = '\0';   // doesn't count towards return value though
521   return used;
522 }
523 
524 // Calculates the length of the C-style escaped version of 'src'.
525 // Assumes that non-printable characters are escaped using octal sequences, and
526 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)527 static inline size_t CEscapedLength(StringPiece src) {
528   static char c_escaped_len[256] = {
529     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
530     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
531     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
532     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
533     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
534     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
535     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
536     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
537     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
538     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
539     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
540     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545   };
546 
547   size_t escaped_len = 0;
548   for (int i = 0; i < src.size(); ++i) {
549     unsigned char c = static_cast<unsigned char>(src[i]);
550     escaped_len += c_escaped_len[c];
551   }
552   return escaped_len;
553 }
554 
555 // ----------------------------------------------------------------------
556 // Escapes 'src' using C-style escape sequences, and appends the escaped string
557 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
558 // the required space using a lookup table, and also does not do any special
559 // handling for Hex or UTF-8 characters.
560 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,std::string * dest)561 void CEscapeAndAppend(StringPiece src, std::string *dest) {
562   size_t escaped_len = CEscapedLength(src);
563   if (escaped_len == src.size()) {
564     dest->append(src.data(), src.size());
565     return;
566   }
567 
568   size_t cur_dest_len = dest->size();
569   dest->resize(cur_dest_len + escaped_len);
570   char* append_ptr = &(*dest)[cur_dest_len];
571 
572   for (int i = 0; i < src.size(); ++i) {
573     unsigned char c = static_cast<unsigned char>(src[i]);
574     switch (c) {
575       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
576       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
577       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
578       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
579       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
580       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
581       default:
582         if (!isprint(c)) {
583           *append_ptr++ = '\\';
584           *append_ptr++ = '0' + c / 64;
585           *append_ptr++ = '0' + (c % 64) / 8;
586           *append_ptr++ = '0' + c % 8;
587         } else {
588           *append_ptr++ = c;
589         }
590         break;
591     }
592   }
593 }
594 
CEscape(const std::string & src)595 std::string CEscape(const std::string &src) {
596   std::string dest;
597   CEscapeAndAppend(src, &dest);
598   return dest;
599 }
600 
601 namespace strings {
602 
Utf8SafeCEscape(const std::string & src)603 std::string Utf8SafeCEscape(const std::string &src) {
604   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
605   std::unique_ptr<char[]> dest(new char[dest_length]);
606   const int len = CEscapeInternal(src.data(), src.size(),
607                                   dest.get(), dest_length, false, true);
608   GOOGLE_DCHECK_GE(len, 0);
609   return std::string(dest.get(), len);
610 }
611 
CHexEscape(const std::string & src)612 std::string CHexEscape(const std::string &src) {
613   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
614   std::unique_ptr<char[]> dest(new char[dest_length]);
615   const int len = CEscapeInternal(src.data(), src.size(),
616                                   dest.get(), dest_length, true, false);
617   GOOGLE_DCHECK_GE(len, 0);
618   return std::string(dest.get(), len);
619 }
620 
621 }  // namespace strings
622 
623 // ----------------------------------------------------------------------
624 // strto32_adaptor()
625 // strtou32_adaptor()
626 //    Implementation of strto[u]l replacements that have identical
627 //    overflow and underflow characteristics for both ILP-32 and LP-64
628 //    platforms, including errno preservation in error-free calls.
629 // ----------------------------------------------------------------------
630 
strto32_adaptor(const char * nptr,char ** endptr,int base)631 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
632   const int saved_errno = errno;
633   errno = 0;
634   const long result = strtol(nptr, endptr, base);
635   if (errno == ERANGE && result == LONG_MIN) {
636     return kint32min;
637   } else if (errno == ERANGE && result == LONG_MAX) {
638     return kint32max;
639   } else if (errno == 0 && result < kint32min) {
640     errno = ERANGE;
641     return kint32min;
642   } else if (errno == 0 && result > kint32max) {
643     errno = ERANGE;
644     return kint32max;
645   }
646   if (errno == 0)
647     errno = saved_errno;
648   return static_cast<int32>(result);
649 }
650 
strtou32_adaptor(const char * nptr,char ** endptr,int base)651 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
652   const int saved_errno = errno;
653   errno = 0;
654   const unsigned long result = strtoul(nptr, endptr, base);
655   if (errno == ERANGE && result == ULONG_MAX) {
656     return kuint32max;
657   } else if (errno == 0 && result > kuint32max) {
658     errno = ERANGE;
659     return kuint32max;
660   }
661   if (errno == 0)
662     errno = saved_errno;
663   return static_cast<uint32>(result);
664 }
665 
safe_parse_sign(std::string * text,bool * negative_ptr)666 inline bool safe_parse_sign(std::string *text /*inout*/,
667                             bool *negative_ptr /*output*/) {
668   const char* start = text->data();
669   const char* end = start + text->size();
670 
671   // Consume whitespace.
672   while (start < end && (start[0] == ' ')) {
673     ++start;
674   }
675   while (start < end && (end[-1] == ' ')) {
676     --end;
677   }
678   if (start >= end) {
679     return false;
680   }
681 
682   // Consume sign.
683   *negative_ptr = (start[0] == '-');
684   if (*negative_ptr || start[0] == '+') {
685     ++start;
686     if (start >= end) {
687       return false;
688     }
689   }
690   *text = text->substr(start - text->data(), end - start);
691   return true;
692 }
693 
694 template <typename IntType>
safe_parse_positive_int(std::string text,IntType * value_p)695 bool safe_parse_positive_int(std::string text, IntType *value_p) {
696   int base = 10;
697   IntType value = 0;
698   const IntType vmax = std::numeric_limits<IntType>::max();
699   assert(vmax > 0);
700   assert(vmax >= base);
701   const IntType vmax_over_base = vmax / base;
702   const char* start = text.data();
703   const char* end = start + text.size();
704   // loop over digits
705   for (; start < end; ++start) {
706     unsigned char c = static_cast<unsigned char>(start[0]);
707     int digit = c - '0';
708     if (digit >= base || digit < 0) {
709       *value_p = value;
710       return false;
711     }
712     if (value > vmax_over_base) {
713       *value_p = vmax;
714       return false;
715     }
716     value *= base;
717     if (value > vmax - digit) {
718       *value_p = vmax;
719       return false;
720     }
721     value += digit;
722   }
723   *value_p = value;
724   return true;
725 }
726 
727 template <typename IntType>
safe_parse_negative_int(const std::string & text,IntType * value_p)728 bool safe_parse_negative_int(const std::string &text, IntType *value_p) {
729   int base = 10;
730   IntType value = 0;
731   const IntType vmin = std::numeric_limits<IntType>::min();
732   assert(vmin < 0);
733   assert(vmin <= 0 - base);
734   IntType vmin_over_base = vmin / base;
735   // 2003 c++ standard [expr.mul]
736   // "... the sign of the remainder is implementation-defined."
737   // Although (vmin/base)*base + vmin%base is always vmin.
738   // 2011 c++ standard tightens the spec but we cannot rely on it.
739   if (vmin % base > 0) {
740     vmin_over_base += 1;
741   }
742   const char* start = text.data();
743   const char* end = start + text.size();
744   // loop over digits
745   for (; start < end; ++start) {
746     unsigned char c = static_cast<unsigned char>(start[0]);
747     int digit = c - '0';
748     if (digit >= base || digit < 0) {
749       *value_p = value;
750       return false;
751     }
752     if (value < vmin_over_base) {
753       *value_p = vmin;
754       return false;
755     }
756     value *= base;
757     if (value < vmin + digit) {
758       *value_p = vmin;
759       return false;
760     }
761     value -= digit;
762   }
763   *value_p = value;
764   return true;
765 }
766 
767 template <typename IntType>
safe_int_internal(std::string text,IntType * value_p)768 bool safe_int_internal(std::string text, IntType *value_p) {
769   *value_p = 0;
770   bool negative;
771   if (!safe_parse_sign(&text, &negative)) {
772     return false;
773   }
774   if (!negative) {
775     return safe_parse_positive_int(text, value_p);
776   } else {
777     return safe_parse_negative_int(text, value_p);
778   }
779 }
780 
781 template <typename IntType>
safe_uint_internal(std::string text,IntType * value_p)782 bool safe_uint_internal(std::string text, IntType *value_p) {
783   *value_p = 0;
784   bool negative;
785   if (!safe_parse_sign(&text, &negative) || negative) {
786     return false;
787   }
788   return safe_parse_positive_int(text, value_p);
789 }
790 
791 // ----------------------------------------------------------------------
792 // FastIntToBuffer()
793 // FastInt64ToBuffer()
794 // FastHexToBuffer()
795 // FastHex64ToBuffer()
796 // FastHex32ToBuffer()
797 // ----------------------------------------------------------------------
798 
799 // Offset into buffer where FastInt64ToBuffer places the end of string
800 // null character.  Also used by FastInt64ToBufferLeft.
801 static const int kFastInt64ToBufferOffset = 21;
802 
FastInt64ToBuffer(int64 i,char * buffer)803 char *FastInt64ToBuffer(int64 i, char* buffer) {
804   // We could collapse the positive and negative sections, but that
805   // would be slightly slower for positive numbers...
806   // 22 bytes is enough to store -2**64, -18446744073709551616.
807   char* p = buffer + kFastInt64ToBufferOffset;
808   *p-- = '\0';
809   if (i >= 0) {
810     do {
811       *p-- = '0' + i % 10;
812       i /= 10;
813     } while (i > 0);
814     return p + 1;
815   } else {
816     // On different platforms, % and / have different behaviors for
817     // negative numbers, so we need to jump through hoops to make sure
818     // we don't divide negative numbers.
819     if (i > -10) {
820       i = -i;
821       *p-- = '0' + i;
822       *p = '-';
823       return p;
824     } else {
825       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
826       i = i + 10;
827       i = -i;
828       *p-- = '0' + i % 10;
829       // Undo what we did a moment ago
830       i = i / 10 + 1;
831       do {
832         *p-- = '0' + i % 10;
833         i /= 10;
834       } while (i > 0);
835       *p = '-';
836       return p;
837     }
838   }
839 }
840 
841 // Offset into buffer where FastInt32ToBuffer places the end of string
842 // null character.  Also used by FastInt32ToBufferLeft
843 static const int kFastInt32ToBufferOffset = 11;
844 
845 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
846 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
847 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)848 char *FastInt32ToBuffer(int32 i, char* buffer) {
849   // We could collapse the positive and negative sections, but that
850   // would be slightly slower for positive numbers...
851   // 12 bytes is enough to store -2**32, -4294967296.
852   char* p = buffer + kFastInt32ToBufferOffset;
853   *p-- = '\0';
854   if (i >= 0) {
855     do {
856       *p-- = '0' + i % 10;
857       i /= 10;
858     } while (i > 0);
859     return p + 1;
860   } else {
861     // On different platforms, % and / have different behaviors for
862     // negative numbers, so we need to jump through hoops to make sure
863     // we don't divide negative numbers.
864     if (i > -10) {
865       i = -i;
866       *p-- = '0' + i;
867       *p = '-';
868       return p;
869     } else {
870       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
871       i = i + 10;
872       i = -i;
873       *p-- = '0' + i % 10;
874       // Undo what we did a moment ago
875       i = i / 10 + 1;
876       do {
877         *p-- = '0' + i % 10;
878         i /= 10;
879       } while (i > 0);
880       *p = '-';
881       return p;
882     }
883   }
884 }
885 
FastHexToBuffer(int i,char * buffer)886 char *FastHexToBuffer(int i, char* buffer) {
887   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
888 
889   static const char *hexdigits = "0123456789abcdef";
890   char *p = buffer + 21;
891   *p-- = '\0';
892   do {
893     *p-- = hexdigits[i & 15];   // mod by 16
894     i >>= 4;                    // divide by 16
895   } while (i > 0);
896   return p + 1;
897 }
898 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)899 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
900   static const char *hexdigits = "0123456789abcdef";
901   buffer[num_byte] = '\0';
902   for (int i = num_byte - 1; i >= 0; i--) {
903 #ifdef _M_X64
904     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
905     // block. Given that the uint32 cast was to improve performance on 32-bit
906     // platforms, we use 64-bit '&' directly.
907     buffer[i] = hexdigits[value & 0xf];
908 #else
909     buffer[i] = hexdigits[uint32(value) & 0xf];
910 #endif
911     value >>= 4;
912   }
913   return buffer;
914 }
915 
FastHex64ToBuffer(uint64 value,char * buffer)916 char *FastHex64ToBuffer(uint64 value, char* buffer) {
917   return InternalFastHexToBuffer(value, buffer, 16);
918 }
919 
FastHex32ToBuffer(uint32 value,char * buffer)920 char *FastHex32ToBuffer(uint32 value, char* buffer) {
921   return InternalFastHexToBuffer(value, buffer, 8);
922 }
923 
924 // ----------------------------------------------------------------------
925 // FastInt32ToBufferLeft()
926 // FastUInt32ToBufferLeft()
927 // FastInt64ToBufferLeft()
928 // FastUInt64ToBufferLeft()
929 //
930 // Like the Fast*ToBuffer() functions above, these are intended for speed.
931 // Unlike the Fast*ToBuffer() functions, however, these functions write
932 // their output to the beginning of the buffer (hence the name, as the
933 // output is left-aligned).  The caller is responsible for ensuring that
934 // the buffer has enough space to hold the output.
935 //
936 // Returns a pointer to the end of the string (i.e. the null character
937 // terminating the string).
938 // ----------------------------------------------------------------------
939 
940 static const char two_ASCII_digits[100][2] = {
941   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
942   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
943   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
944   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
945   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
946   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
947   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
948   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
949   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
950   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
951   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
952   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
953   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
954   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
955   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
956   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
957   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
958   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
959   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
960   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
961 };
962 
FastUInt32ToBufferLeft(uint32 u,char * buffer)963 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
964   uint32 digits;
965   const char *ASCII_digits = nullptr;
966   // The idea of this implementation is to trim the number of divides to as few
967   // as possible by using multiplication and subtraction rather than mod (%),
968   // and by outputting two digits at a time rather than one.
969   // The huge-number case is first, in the hopes that the compiler will output
970   // that case in one branch-free block of code, and only output conditional
971   // branches into it from below.
972   if (u >= 1000000000) {  // >= 1,000,000,000
973     digits = u / 100000000;  // 100,000,000
974     ASCII_digits = two_ASCII_digits[digits];
975     buffer[0] = ASCII_digits[0];
976     buffer[1] = ASCII_digits[1];
977     buffer += 2;
978 sublt100_000_000:
979     u -= digits * 100000000;  // 100,000,000
980 lt100_000_000:
981     digits = u / 1000000;  // 1,000,000
982     ASCII_digits = two_ASCII_digits[digits];
983     buffer[0] = ASCII_digits[0];
984     buffer[1] = ASCII_digits[1];
985     buffer += 2;
986 sublt1_000_000:
987     u -= digits * 1000000;  // 1,000,000
988 lt1_000_000:
989     digits = u / 10000;  // 10,000
990     ASCII_digits = two_ASCII_digits[digits];
991     buffer[0] = ASCII_digits[0];
992     buffer[1] = ASCII_digits[1];
993     buffer += 2;
994 sublt10_000:
995     u -= digits * 10000;  // 10,000
996 lt10_000:
997     digits = u / 100;
998     ASCII_digits = two_ASCII_digits[digits];
999     buffer[0] = ASCII_digits[0];
1000     buffer[1] = ASCII_digits[1];
1001     buffer += 2;
1002 sublt100:
1003     u -= digits * 100;
1004 lt100:
1005     digits = u;
1006     ASCII_digits = two_ASCII_digits[digits];
1007     buffer[0] = ASCII_digits[0];
1008     buffer[1] = ASCII_digits[1];
1009     buffer += 2;
1010 done:
1011     *buffer = 0;
1012     return buffer;
1013   }
1014 
1015   if (u < 100) {
1016     digits = u;
1017     if (u >= 10) goto lt100;
1018     *buffer++ = '0' + digits;
1019     goto done;
1020   }
1021   if (u  <  10000) {   // 10,000
1022     if (u >= 1000) goto lt10_000;
1023     digits = u / 100;
1024     *buffer++ = '0' + digits;
1025     goto sublt100;
1026   }
1027   if (u  <  1000000) {   // 1,000,000
1028     if (u >= 100000) goto lt1_000_000;
1029     digits = u / 10000;  //    10,000
1030     *buffer++ = '0' + digits;
1031     goto sublt10_000;
1032   }
1033   if (u  <  100000000) {   // 100,000,000
1034     if (u >= 10000000) goto lt100_000_000;
1035     digits = u / 1000000;  //   1,000,000
1036     *buffer++ = '0' + digits;
1037     goto sublt1_000_000;
1038   }
1039   // we already know that u < 1,000,000,000
1040   digits = u / 100000000;   // 100,000,000
1041   *buffer++ = '0' + digits;
1042   goto sublt100_000_000;
1043 }
1044 
FastInt32ToBufferLeft(int32 i,char * buffer)1045 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1046   uint32 u = 0;
1047   if (i < 0) {
1048     *buffer++ = '-';
1049     u -= i;
1050   } else {
1051     u = i;
1052   }
1053   return FastUInt32ToBufferLeft(u, buffer);
1054 }
1055 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1056 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1057   int digits;
1058   const char *ASCII_digits = nullptr;
1059 
1060   uint32 u = static_cast<uint32>(u64);
1061   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1062 
1063   uint64 top_11_digits = u64 / 1000000000;
1064   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1065   u = u64 - (top_11_digits * 1000000000);
1066 
1067   digits = u / 10000000;  // 10,000,000
1068   GOOGLE_DCHECK_LT(digits, 100);
1069   ASCII_digits = two_ASCII_digits[digits];
1070   buffer[0] = ASCII_digits[0];
1071   buffer[1] = ASCII_digits[1];
1072   buffer += 2;
1073   u -= digits * 10000000;  // 10,000,000
1074   digits = u / 100000;  // 100,000
1075   ASCII_digits = two_ASCII_digits[digits];
1076   buffer[0] = ASCII_digits[0];
1077   buffer[1] = ASCII_digits[1];
1078   buffer += 2;
1079   u -= digits * 100000;  // 100,000
1080   digits = u / 1000;  // 1,000
1081   ASCII_digits = two_ASCII_digits[digits];
1082   buffer[0] = ASCII_digits[0];
1083   buffer[1] = ASCII_digits[1];
1084   buffer += 2;
1085   u -= digits * 1000;  // 1,000
1086   digits = u / 10;
1087   ASCII_digits = two_ASCII_digits[digits];
1088   buffer[0] = ASCII_digits[0];
1089   buffer[1] = ASCII_digits[1];
1090   buffer += 2;
1091   u -= digits * 10;
1092   digits = u;
1093   *buffer++ = '0' + digits;
1094   *buffer = 0;
1095   return buffer;
1096 }
1097 
FastInt64ToBufferLeft(int64 i,char * buffer)1098 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1099   uint64 u = 0;
1100   if (i < 0) {
1101     *buffer++ = '-';
1102     u -= i;
1103   } else {
1104     u = i;
1105   }
1106   return FastUInt64ToBufferLeft(u, buffer);
1107 }
1108 
1109 // ----------------------------------------------------------------------
1110 // SimpleItoa()
1111 //    Description: converts an integer to a string.
1112 //
1113 //    Return value: string
1114 // ----------------------------------------------------------------------
1115 
SimpleItoa(int i)1116 std::string SimpleItoa(int i) {
1117   char buffer[kFastToBufferSize];
1118   return (sizeof(i) == 4) ?
1119     FastInt32ToBuffer(i, buffer) :
1120     FastInt64ToBuffer(i, buffer);
1121 }
1122 
SimpleItoa(unsigned int i)1123 std::string SimpleItoa(unsigned int i) {
1124   char buffer[kFastToBufferSize];
1125   return std::string(buffer, (sizeof(i) == 4)
1126                                  ? FastUInt32ToBufferLeft(i, buffer)
1127                                  : FastUInt64ToBufferLeft(i, buffer));
1128 }
1129 
SimpleItoa(long i)1130 std::string SimpleItoa(long i) {
1131   char buffer[kFastToBufferSize];
1132   return (sizeof(i) == 4) ?
1133     FastInt32ToBuffer(i, buffer) :
1134     FastInt64ToBuffer(i, buffer);
1135 }
1136 
SimpleItoa(unsigned long i)1137 std::string SimpleItoa(unsigned long i) {
1138   char buffer[kFastToBufferSize];
1139   return std::string(buffer, (sizeof(i) == 4)
1140                                  ? FastUInt32ToBufferLeft(i, buffer)
1141                                  : FastUInt64ToBufferLeft(i, buffer));
1142 }
1143 
SimpleItoa(long long i)1144 std::string SimpleItoa(long long i) {
1145   char buffer[kFastToBufferSize];
1146   return (sizeof(i) == 4) ?
1147     FastInt32ToBuffer(i, buffer) :
1148     FastInt64ToBuffer(i, buffer);
1149 }
1150 
SimpleItoa(unsigned long long i)1151 std::string SimpleItoa(unsigned long long i) {
1152   char buffer[kFastToBufferSize];
1153   return std::string(buffer, (sizeof(i) == 4)
1154                                  ? FastUInt32ToBufferLeft(i, buffer)
1155                                  : FastUInt64ToBufferLeft(i, buffer));
1156 }
1157 
1158 // ----------------------------------------------------------------------
1159 // SimpleDtoa()
1160 // SimpleFtoa()
1161 // DoubleToBuffer()
1162 // FloatToBuffer()
1163 //    We want to print the value without losing precision, but we also do
1164 //    not want to print more digits than necessary.  This turns out to be
1165 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1166 //    exactly in binary.  If we print 0.2 with a very large precision,
1167 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1168 //    On the other hand, if we set the precision too low, we lose
1169 //    significant digits when printing numbers that actually need them.
1170 //    It turns out there is no precision value that does the right thing
1171 //    for all numbers.
1172 //
1173 //    Our strategy is to first try printing with a precision that is never
1174 //    over-precise, then parse the result with strtod() to see if it
1175 //    matches.  If not, we print again with a precision that will always
1176 //    give a precise result, but may use more digits than necessary.
1177 //
1178 //    An arguably better strategy would be to use the algorithm described
1179 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1180 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1181 //    however, that the following implementation is about as fast as
1182 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1183 //    will not scale well on multi-core machines.  DMG's code is slightly
1184 //    more accurate (in that it will never use more digits than
1185 //    necessary), but this is probably irrelevant for most users.
1186 //
1187 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1188 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1189 //    one in that it makes guesses and then uses strtod() to check them.
1190 //    Their implementation is faster because they use their own code to
1191 //    generate the digits in the first place rather than use snprintf(),
1192 //    thus avoiding format string parsing overhead.  However, this makes
1193 //    it considerably more complicated than the following implementation,
1194 //    and it is embedded in a larger library.  If speed turns out to be
1195 //    an issue, we could re-implement this in terms of their
1196 //    implementation.
1197 // ----------------------------------------------------------------------
1198 
SimpleDtoa(double value)1199 std::string SimpleDtoa(double value) {
1200   char buffer[kDoubleToBufferSize];
1201   return DoubleToBuffer(value, buffer);
1202 }
1203 
SimpleFtoa(float value)1204 std::string SimpleFtoa(float value) {
1205   char buffer[kFloatToBufferSize];
1206   return FloatToBuffer(value, buffer);
1207 }
1208 
IsValidFloatChar(char c)1209 static inline bool IsValidFloatChar(char c) {
1210   return ('0' <= c && c <= '9') ||
1211          c == 'e' || c == 'E' ||
1212          c == '+' || c == '-';
1213 }
1214 
DelocalizeRadix(char * buffer)1215 void DelocalizeRadix(char* buffer) {
1216   // Fast check:  if the buffer has a normal decimal point, assume no
1217   // translation is needed.
1218   if (strchr(buffer, '.') != nullptr) return;
1219 
1220   // Find the first unknown character.
1221   while (IsValidFloatChar(*buffer)) ++buffer;
1222 
1223   if (*buffer == '\0') {
1224     // No radix character found.
1225     return;
1226   }
1227 
1228   // We are now pointing at the locale-specific radix character.  Replace it
1229   // with '.'.
1230   *buffer = '.';
1231   ++buffer;
1232 
1233   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1234     // It appears the radix was a multi-byte character.  We need to remove the
1235     // extra bytes.
1236     char* target = buffer;
1237     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1238     memmove(target, buffer, strlen(buffer) + 1);
1239   }
1240 }
1241 
DoubleToBuffer(double value,char * buffer)1242 char* DoubleToBuffer(double value, char* buffer) {
1243   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1244   // platforms these days.  Just in case some system exists where DBL_DIG
1245   // is significantly larger -- and risks overflowing our buffer -- we have
1246   // this assert.
1247   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1248 
1249   if (value == std::numeric_limits<double>::infinity()) {
1250     strcpy(buffer, "inf");
1251     return buffer;
1252   } else if (value == -std::numeric_limits<double>::infinity()) {
1253     strcpy(buffer, "-inf");
1254     return buffer;
1255   } else if (std::isnan(value)) {
1256     strcpy(buffer, "nan");
1257     return buffer;
1258   }
1259 
1260   int snprintf_result =
1261     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1262 
1263   // The snprintf should never overflow because the buffer is significantly
1264   // larger than the precision we asked for.
1265   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1266 
1267   // We need to make parsed_value volatile in order to force the compiler to
1268   // write it out to the stack.  Otherwise, it may keep the value in a
1269   // register, and if it does that, it may keep it as a long double instead
1270   // of a double.  This long double may have extra bits that make it compare
1271   // unequal to "value" even though it would be exactly equal if it were
1272   // truncated to a double.
1273   volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1274   if (parsed_value != value) {
1275     int snprintf_result =
1276       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1277 
1278     // Should never overflow; see above.
1279     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1280   }
1281 
1282   DelocalizeRadix(buffer);
1283   return buffer;
1284 }
1285 
memcasecmp(const char * s1,const char * s2,size_t len)1286 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1287   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1288   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1289 
1290   for ( int i = 0; i < len; i++ ) {
1291     const int diff =
1292       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1293       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1294     if (diff != 0) return diff;
1295   }
1296   return 0;
1297 }
1298 
CaseEqual(StringPiece s1,StringPiece s2)1299 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1300   if (s1.size() != s2.size()) return false;
1301   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1302 }
1303 
safe_strtob(StringPiece str,bool * value)1304 bool safe_strtob(StringPiece str, bool* value) {
1305   GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1306   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1307       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1308       CaseEqual(str, "1")) {
1309     *value = true;
1310     return true;
1311   }
1312   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1313       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1314       CaseEqual(str, "0")) {
1315     *value = false;
1316     return true;
1317   }
1318   return false;
1319 }
1320 
safe_strtof(const char * str,float * value)1321 bool safe_strtof(const char* str, float* value) {
1322   char* endptr;
1323   errno = 0;  // errno only gets set on errors
1324 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1325   *value = internal::NoLocaleStrtod(str, &endptr);
1326 #else
1327   *value = strtof(str, &endptr);
1328 #endif
1329   return *str != 0 && *endptr == 0 && errno == 0;
1330 }
1331 
safe_strtod(const char * str,double * value)1332 bool safe_strtod(const char* str, double* value) {
1333   char* endptr;
1334   *value = internal::NoLocaleStrtod(str, &endptr);
1335   if (endptr != str) {
1336     while (ascii_isspace(*endptr)) ++endptr;
1337   }
1338   // Ignore range errors from strtod.  The values it
1339   // returns on underflow and overflow are the right
1340   // fallback in a robust setting.
1341   return *str != '\0' && *endptr == '\0';
1342 }
1343 
safe_strto32(const std::string & str,int32 * value)1344 bool safe_strto32(const std::string &str, int32 *value) {
1345   return safe_int_internal(str, value);
1346 }
1347 
safe_strtou32(const std::string & str,uint32 * value)1348 bool safe_strtou32(const std::string &str, uint32 *value) {
1349   return safe_uint_internal(str, value);
1350 }
1351 
safe_strto64(const std::string & str,int64 * value)1352 bool safe_strto64(const std::string &str, int64 *value) {
1353   return safe_int_internal(str, value);
1354 }
1355 
safe_strtou64(const std::string & str,uint64 * value)1356 bool safe_strtou64(const std::string &str, uint64 *value) {
1357   return safe_uint_internal(str, value);
1358 }
1359 
FloatToBuffer(float value,char * buffer)1360 char* FloatToBuffer(float value, char* buffer) {
1361   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1362   // platforms these days.  Just in case some system exists where FLT_DIG
1363   // is significantly larger -- and risks overflowing our buffer -- we have
1364   // this assert.
1365   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1366 
1367   if (value == std::numeric_limits<double>::infinity()) {
1368     strcpy(buffer, "inf");
1369     return buffer;
1370   } else if (value == -std::numeric_limits<double>::infinity()) {
1371     strcpy(buffer, "-inf");
1372     return buffer;
1373   } else if (std::isnan(value)) {
1374     strcpy(buffer, "nan");
1375     return buffer;
1376   }
1377 
1378   int snprintf_result =
1379     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1380 
1381   // The snprintf should never overflow because the buffer is significantly
1382   // larger than the precision we asked for.
1383   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1384 
1385   float parsed_value;
1386   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1387     int snprintf_result =
1388       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1389 
1390     // Should never overflow; see above.
1391     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1392   }
1393 
1394   DelocalizeRadix(buffer);
1395   return buffer;
1396 }
1397 
1398 namespace strings {
1399 
AlphaNum(strings::Hex hex)1400 AlphaNum::AlphaNum(strings::Hex hex) {
1401   char *const end = &digits[kFastToBufferSize];
1402   char *writer = end;
1403   uint64 value = hex.value;
1404   uint64 width = hex.spec;
1405   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1406   // where 0x10000 is the smallest hex number that is as wide as the user
1407   // asked for.
1408   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1409   static const char hexdigits[] = "0123456789abcdef";
1410   do {
1411     *--writer = hexdigits[value & 0xF];
1412     value >>= 4;
1413     mask >>= 4;
1414   } while (mask != 0);
1415   piece_data_ = writer;
1416   piece_size_ = end - writer;
1417 }
1418 
1419 }  // namespace strings
1420 
1421 // ----------------------------------------------------------------------
1422 // StrCat()
1423 //    This merges the given strings or integers, with no delimiter.  This
1424 //    is designed to be the fastest possible way to construct a string out
1425 //    of a mix of raw C strings, C++ strings, and integer values.
1426 // ----------------------------------------------------------------------
1427 
1428 // Append is merely a version of memcpy that returns the address of the byte
1429 // after the area just overwritten.  It comes in multiple flavors to minimize
1430 // call overhead.
Append1(char * out,const AlphaNum & x)1431 static char *Append1(char *out, const AlphaNum &x) {
1432   if (x.size() > 0) {
1433     memcpy(out, x.data(), x.size());
1434     out += x.size();
1435   }
1436   return out;
1437 }
1438 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1439 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1440   if (x1.size() > 0) {
1441     memcpy(out, x1.data(), x1.size());
1442     out += x1.size();
1443   }
1444   if (x2.size() > 0) {
1445     memcpy(out, x2.data(), x2.size());
1446     out += x2.size();
1447   }
1448   return out;
1449 }
1450 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1451 static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2,
1452                      const AlphaNum &x3, const AlphaNum &x4) {
1453   if (x1.size() > 0) {
1454     memcpy(out, x1.data(), x1.size());
1455     out += x1.size();
1456   }
1457   if (x2.size() > 0) {
1458     memcpy(out, x2.data(), x2.size());
1459     out += x2.size();
1460   }
1461   if (x3.size() > 0) {
1462     memcpy(out, x3.data(), x3.size());
1463     out += x3.size();
1464   }
1465   if (x4.size() > 0) {
1466     memcpy(out, x4.data(), x4.size());
1467     out += x4.size();
1468   }
1469   return out;
1470 }
1471 
StrCat(const AlphaNum & a,const AlphaNum & b)1472 std::string StrCat(const AlphaNum &a, const AlphaNum &b) {
1473   std::string result;
1474   result.resize(a.size() + b.size());
1475   char *const begin = &*result.begin();
1476   char *out = Append2(begin, a, b);
1477   GOOGLE_DCHECK_EQ(out, begin + result.size());
1478   return result;
1479 }
1480 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1481 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1482   std::string result;
1483   result.resize(a.size() + b.size() + c.size());
1484   char *const begin = &*result.begin();
1485   char *out = Append2(begin, a, b);
1486   out = Append1(out, c);
1487   GOOGLE_DCHECK_EQ(out, begin + result.size());
1488   return result;
1489 }
1490 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1491 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1492                    const AlphaNum &d) {
1493   std::string result;
1494   result.resize(a.size() + b.size() + c.size() + d.size());
1495   char *const begin = &*result.begin();
1496   char *out = Append4(begin, a, b, c, d);
1497   GOOGLE_DCHECK_EQ(out, begin + result.size());
1498   return result;
1499 }
1500 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1501 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1502                    const AlphaNum &d, const AlphaNum &e) {
1503   std::string result;
1504   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1505   char *const begin = &*result.begin();
1506   char *out = Append4(begin, a, b, c, d);
1507   out = Append1(out, e);
1508   GOOGLE_DCHECK_EQ(out, begin + result.size());
1509   return result;
1510 }
1511 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1512 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1513                    const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1514   std::string result;
1515   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1516                 f.size());
1517   char *const begin = &*result.begin();
1518   char *out = Append4(begin, a, b, c, d);
1519   out = Append2(out, e, f);
1520   GOOGLE_DCHECK_EQ(out, begin + result.size());
1521   return result;
1522 }
1523 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1524 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1525                    const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1526                    const AlphaNum &g) {
1527   std::string result;
1528   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1529                 f.size() + g.size());
1530   char *const begin = &*result.begin();
1531   char *out = Append4(begin, a, b, c, d);
1532   out = Append2(out, e, f);
1533   out = Append1(out, g);
1534   GOOGLE_DCHECK_EQ(out, begin + result.size());
1535   return result;
1536 }
1537 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1538 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1539                    const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1540                    const AlphaNum &g, const AlphaNum &h) {
1541   std::string result;
1542   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1543                 f.size() + g.size() + h.size());
1544   char *const begin = &*result.begin();
1545   char *out = Append4(begin, a, b, c, d);
1546   out = Append4(out, e, f, g, h);
1547   GOOGLE_DCHECK_EQ(out, begin + result.size());
1548   return result;
1549 }
1550 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1551 std::string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1552                    const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1553                    const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1554   std::string result;
1555   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1556                 f.size() + g.size() + h.size() + i.size());
1557   char *const begin = &*result.begin();
1558   char *out = Append4(begin, a, b, c, d);
1559   out = Append4(out, e, f, g, h);
1560   out = Append1(out, i);
1561   GOOGLE_DCHECK_EQ(out, begin + result.size());
1562   return result;
1563 }
1564 
1565 // It's possible to call StrAppend with a char * pointer that is partway into
1566 // the string we're appending to.  However the results of this are random.
1567 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1568 // to do one comparison.
1569 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1570     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1571                      uintptr_t((dest).size()))
1572 
StrAppend(std::string * result,const AlphaNum & a)1573 void StrAppend(std::string *result, const AlphaNum &a) {
1574   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1575   result->append(a.data(), a.size());
1576 }
1577 
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b)1578 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b) {
1579   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1580   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1581   std::string::size_type old_size = result->size();
1582   result->resize(old_size + a.size() + b.size());
1583   char *const begin = &*result->begin();
1584   char *out = Append2(begin + old_size, a, b);
1585   GOOGLE_DCHECK_EQ(out, begin + result->size());
1586 }
1587 
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1588 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
1589                const AlphaNum &c) {
1590   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1591   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1592   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1593   std::string::size_type old_size = result->size();
1594   result->resize(old_size + a.size() + b.size() + c.size());
1595   char *const begin = &*result->begin();
1596   char *out = Append2(begin + old_size, a, b);
1597   out = Append1(out, c);
1598   GOOGLE_DCHECK_EQ(out, begin + result->size());
1599 }
1600 
StrAppend(std::string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1601 void StrAppend(std::string *result, const AlphaNum &a, const AlphaNum &b,
1602                const AlphaNum &c, const AlphaNum &d) {
1603   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1604   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1605   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1606   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1607   std::string::size_type old_size = result->size();
1608   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1609   char *const begin = &*result->begin();
1610   char *out = Append4(begin + old_size, a, b, c, d);
1611   GOOGLE_DCHECK_EQ(out, begin + result->size());
1612 }
1613 
GlobalReplaceSubstring(const std::string & substring,const std::string & replacement,std::string * s)1614 int GlobalReplaceSubstring(const std::string &substring,
1615                            const std::string &replacement, std::string *s) {
1616   GOOGLE_CHECK(s != nullptr);
1617   if (s->empty() || substring.empty())
1618     return 0;
1619   std::string tmp;
1620   int num_replacements = 0;
1621   int pos = 0;
1622   for (int match_pos = s->find(substring.data(), pos, substring.length());
1623        match_pos != std::string::npos; pos = match_pos + substring.length(),
1624            match_pos = s->find(substring.data(), pos, substring.length())) {
1625     ++num_replacements;
1626     // Append the original content before the match.
1627     tmp.append(*s, pos, match_pos - pos);
1628     // Append the replacement for the match.
1629     tmp.append(replacement.begin(), replacement.end());
1630   }
1631   // Append the content after the last match. If no replacements were made, the
1632   // original string is left untouched.
1633   if (num_replacements > 0) {
1634     tmp.append(*s, pos, s->length() - pos);
1635     s->swap(tmp);
1636   }
1637   return num_replacements;
1638 }
1639 
CalculateBase64EscapedLen(int input_len,bool do_padding)1640 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1641   // Base64 encodes three bytes of input at a time. If the input is not
1642   // divisible by three, we pad as appropriate.
1643   //
1644   // (from http://tools.ietf.org/html/rfc3548)
1645   // Special processing is performed if fewer than 24 bits are available
1646   // at the end of the data being encoded.  A full encoding quantum is
1647   // always completed at the end of a quantity.  When fewer than 24 input
1648   // bits are available in an input group, zero bits are added (on the
1649   // right) to form an integral number of 6-bit groups.  Padding at the
1650   // end of the data is performed using the '=' character.  Since all base
1651   // 64 input is an integral number of octets, only the following cases
1652   // can arise:
1653 
1654 
1655   // Base64 encodes each three bytes of input into four bytes of output.
1656   int len = (input_len / 3) * 4;
1657 
1658   if (input_len % 3 == 0) {
1659     // (from http://tools.ietf.org/html/rfc3548)
1660     // (1) the final quantum of encoding input is an integral multiple of 24
1661     // bits; here, the final unit of encoded output will be an integral
1662     // multiple of 4 characters with no "=" padding,
1663   } else if (input_len % 3 == 1) {
1664     // (from http://tools.ietf.org/html/rfc3548)
1665     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1666     // final unit of encoded output will be two characters followed by two
1667     // "=" padding characters, or
1668     len += 2;
1669     if (do_padding) {
1670       len += 2;
1671     }
1672   } else {  // (input_len % 3 == 2)
1673     // (from http://tools.ietf.org/html/rfc3548)
1674     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1675     // final unit of encoded output will be three characters followed by one
1676     // "=" padding character.
1677     len += 3;
1678     if (do_padding) {
1679       len += 1;
1680     }
1681   }
1682 
1683   assert(len >= input_len);  // make sure we didn't overflow
1684   return len;
1685 }
1686 
1687 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1688 int CalculateBase64EscapedLen(int input_len) {
1689   return CalculateBase64EscapedLen(input_len, true);
1690 }
1691 
1692 // ----------------------------------------------------------------------
1693 // int Base64Unescape() - base64 decoder
1694 // int Base64Escape() - base64 encoder
1695 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1696 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1697 //
1698 // Check out
1699 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1700 // care about is that...
1701 //   Take the encoded stuff in groups of 4 characters and turn each
1702 //   character into a code 0 to 63 thus:
1703 //           A-Z map to 0 to 25
1704 //           a-z map to 26 to 51
1705 //           0-9 map to 52 to 61
1706 //           +(- for WebSafe) maps to 62
1707 //           /(_ for WebSafe) maps to 63
1708 //   There will be four numbers, all less than 64 which can be represented
1709 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1710 //   Arrange the 6 digit binary numbers into three bytes as such:
1711 //   aaaaaabb bbbbcccc ccdddddd
1712 //   Equals signs (one or two) are used at the end of the encoded block to
1713 //   indicate that the text was not an integer multiple of three bytes long.
1714 // ----------------------------------------------------------------------
1715 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1716 int Base64UnescapeInternal(const char *src_param, int szsrc,
1717                            char *dest, int szdest,
1718                            const signed char* unbase64) {
1719   static const char kPad64Equals = '=';
1720   static const char kPad64Dot = '.';
1721 
1722   int decode = 0;
1723   int destidx = 0;
1724   int state = 0;
1725   unsigned int ch = 0;
1726   unsigned int temp = 0;
1727 
1728   // If "char" is signed by default, using *src as an array index results in
1729   // accessing negative array elements. Treat the input as a pointer to
1730   // unsigned char to avoid this.
1731   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1732 
1733   // The GET_INPUT macro gets the next input character, skipping
1734   // over any whitespace, and stopping when we reach the end of the
1735   // string or when we read any non-data character.  The arguments are
1736   // an arbitrary identifier (used as a label for goto) and the number
1737   // of data bytes that must remain in the input to avoid aborting the
1738   // loop.
1739 #define GET_INPUT(label, remain)                 \
1740   label:                                         \
1741     --szsrc;                                     \
1742     ch = *src++;                                 \
1743     decode = unbase64[ch];                       \
1744     if (decode < 0) {                            \
1745       if (ascii_isspace(ch) && szsrc >= remain)  \
1746         goto label;                              \
1747       state = 4 - remain;                        \
1748       break;                                     \
1749     }
1750 
1751   // if dest is null, we're just checking to see if it's legal input
1752   // rather than producing output.  (I suspect this could just be done
1753   // with a regexp...).  We duplicate the loop so this test can be
1754   // outside it instead of in every iteration.
1755 
1756   if (dest) {
1757     // This loop consumes 4 input bytes and produces 3 output bytes
1758     // per iteration.  We can't know at the start that there is enough
1759     // data left in the string for a full iteration, so the loop may
1760     // break out in the middle; if so 'state' will be set to the
1761     // number of input bytes read.
1762 
1763     while (szsrc >= 4)  {
1764       // We'll start by optimistically assuming that the next four
1765       // bytes of the string (src[0..3]) are four good data bytes
1766       // (that is, no nulls, whitespace, padding chars, or illegal
1767       // chars).  We need to test src[0..2] for nulls individually
1768       // before constructing temp to preserve the property that we
1769       // never read past a null in the string (no matter how long
1770       // szsrc claims the string is).
1771 
1772       if (!src[0] || !src[1] || !src[2] ||
1773           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1774                    (unsigned(unbase64[src[1]]) << 12) |
1775                    (unsigned(unbase64[src[2]]) << 6) |
1776                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1777         // Iff any of those four characters was bad (null, illegal,
1778         // whitespace, padding), then temp's high bit will be set
1779         // (because unbase64[] is -1 for all bad characters).
1780         //
1781         // We'll back up and resort to the slower decoder, which knows
1782         // how to handle those cases.
1783 
1784         GET_INPUT(first, 4);
1785         temp = decode;
1786         GET_INPUT(second, 3);
1787         temp = (temp << 6) | decode;
1788         GET_INPUT(third, 2);
1789         temp = (temp << 6) | decode;
1790         GET_INPUT(fourth, 1);
1791         temp = (temp << 6) | decode;
1792       } else {
1793         // We really did have four good data bytes, so advance four
1794         // characters in the string.
1795 
1796         szsrc -= 4;
1797         src += 4;
1798         decode = -1;
1799         ch = '\0';
1800       }
1801 
1802       // temp has 24 bits of input, so write that out as three bytes.
1803 
1804       if (destidx+3 > szdest) return -1;
1805       dest[destidx+2] = temp;
1806       temp >>= 8;
1807       dest[destidx+1] = temp;
1808       temp >>= 8;
1809       dest[destidx] = temp;
1810       destidx += 3;
1811     }
1812   } else {
1813     while (szsrc >= 4)  {
1814       if (!src[0] || !src[1] || !src[2] ||
1815           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1816                    (unsigned(unbase64[src[1]]) << 12) |
1817                    (unsigned(unbase64[src[2]]) << 6) |
1818                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1819         GET_INPUT(first_no_dest, 4);
1820         GET_INPUT(second_no_dest, 3);
1821         GET_INPUT(third_no_dest, 2);
1822         GET_INPUT(fourth_no_dest, 1);
1823       } else {
1824         szsrc -= 4;
1825         src += 4;
1826         decode = -1;
1827         ch = '\0';
1828       }
1829       destidx += 3;
1830     }
1831   }
1832 
1833 #undef GET_INPUT
1834 
1835   // if the loop terminated because we read a bad character, return
1836   // now.
1837   if (decode < 0 && ch != '\0' &&
1838       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1839     return -1;
1840 
1841   if (ch == kPad64Equals || ch == kPad64Dot) {
1842     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1843     // look at it again when we count to check for the proper number of
1844     // equals signs at the end.
1845     ++szsrc;
1846     --src;
1847   } else {
1848     // This loop consumes 1 input byte per iteration.  It's used to
1849     // clean up the 0-3 input bytes remaining when the first, faster
1850     // loop finishes.  'temp' contains the data from 'state' input
1851     // characters read by the first loop.
1852     while (szsrc > 0)  {
1853       --szsrc;
1854       ch = *src++;
1855       decode = unbase64[ch];
1856       if (decode < 0) {
1857         if (ascii_isspace(ch)) {
1858           continue;
1859         } else if (ch == '\0') {
1860           break;
1861         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1862           // back up one character; we'll read it again when we check
1863           // for the correct number of pad characters at the end.
1864           ++szsrc;
1865           --src;
1866           break;
1867         } else {
1868           return -1;
1869         }
1870       }
1871 
1872       // Each input character gives us six bits of output.
1873       temp = (temp << 6) | decode;
1874       ++state;
1875       if (state == 4) {
1876         // If we've accumulated 24 bits of output, write that out as
1877         // three bytes.
1878         if (dest) {
1879           if (destidx+3 > szdest) return -1;
1880           dest[destidx+2] = temp;
1881           temp >>= 8;
1882           dest[destidx+1] = temp;
1883           temp >>= 8;
1884           dest[destidx] = temp;
1885         }
1886         destidx += 3;
1887         state = 0;
1888         temp = 0;
1889       }
1890     }
1891   }
1892 
1893   // Process the leftover data contained in 'temp' at the end of the input.
1894   int expected_equals = 0;
1895   switch (state) {
1896     case 0:
1897       // Nothing left over; output is a multiple of 3 bytes.
1898       break;
1899 
1900     case 1:
1901       // Bad input; we have 6 bits left over.
1902       return -1;
1903 
1904     case 2:
1905       // Produce one more output byte from the 12 input bits we have left.
1906       if (dest) {
1907         if (destidx+1 > szdest) return -1;
1908         temp >>= 4;
1909         dest[destidx] = temp;
1910       }
1911       ++destidx;
1912       expected_equals = 2;
1913       break;
1914 
1915     case 3:
1916       // Produce two more output bytes from the 18 input bits we have left.
1917       if (dest) {
1918         if (destidx+2 > szdest) return -1;
1919         temp >>= 2;
1920         dest[destidx+1] = temp;
1921         temp >>= 8;
1922         dest[destidx] = temp;
1923       }
1924       destidx += 2;
1925       expected_equals = 1;
1926       break;
1927 
1928     default:
1929       // state should have no other values at this point.
1930       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1931   }
1932 
1933   // The remainder of the string should be all whitespace, mixed with
1934   // exactly 0 equals signs, or exactly 'expected_equals' equals
1935   // signs.  (Always accepting 0 equals signs is a google extension
1936   // not covered in the RFC, as is accepting dot as the pad character.)
1937 
1938   int equals = 0;
1939   while (szsrc > 0 && *src) {
1940     if (*src == kPad64Equals || *src == kPad64Dot)
1941       ++equals;
1942     else if (!ascii_isspace(*src))
1943       return -1;
1944     --szsrc;
1945     ++src;
1946   }
1947 
1948   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1949 }
1950 
1951 // The arrays below were generated by the following code
1952 // #include <sys/time.h>
1953 // #include <stdlib.h>
1954 // #include <string.h>
1955 // #include <stdio.h>
1956 // main()
1957 // {
1958 //   static const char Base64[] =
1959 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1960 //   const char *pos;
1961 //   int idx, i, j;
1962 //   printf("    ");
1963 //   for (i = 0; i < 255; i += 8) {
1964 //     for (j = i; j < i + 8; j++) {
1965 //       pos = strchr(Base64, j);
1966 //       if ((pos == nullptr) || (j == 0))
1967 //         idx = -1;
1968 //       else
1969 //         idx = pos - Base64;
1970 //       if (idx == -1)
1971 //         printf(" %2d,     ", idx);
1972 //       else
1973 //         printf(" %2d/""*%c*""/,", idx, j);
1974 //     }
1975 //     printf("\n    ");
1976 //   }
1977 // }
1978 //
1979 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1980 // tables from the functions below.
1981 static const signed char kUnBase64[] = {
1982   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1983   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1984   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1985   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1986   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1987   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1988   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1989   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
1990   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
1991    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1992   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1993   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
1994   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
1995   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
1996   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
1997   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
1998   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1999   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2000   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2001   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2002   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2003   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2004   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2005   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2006   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2007   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2008   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2011   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2012   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2013   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2014 };
2015 static const signed char kUnWebSafeBase64[] = {
2016   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2017   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2018   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2019   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2020   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2021   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2022   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2023   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2024   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2025    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2026   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2027   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2028   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2029   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2030   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2031   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2032   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2033   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2034   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2035   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2036   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2037   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2038   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2039   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2040   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2041   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2042   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2043   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2044   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2045   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2046   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2047   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2048 };
2049 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2050 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2051   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2052 }
2053 
Base64UnescapeInternal(const char * src,int slen,std::string * dest,const signed char * unbase64)2054 static bool Base64UnescapeInternal(const char *src, int slen, std::string *dest,
2055                                    const signed char *unbase64) {
2056   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2057   // 4 characters.  any leftover chars are added directly for good measure.
2058   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2059   const int dest_len = 3 * (slen / 4) + (slen % 4);
2060 
2061   dest->resize(dest_len);
2062 
2063   // We are getting the destination buffer by getting the beginning of the
2064   // string and converting it into a char *.
2065   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2066                                          dest_len, unbase64);
2067   if (len < 0) {
2068     dest->clear();
2069     return false;
2070   }
2071 
2072   // could be shorter if there was padding
2073   GOOGLE_DCHECK_LE(len, dest_len);
2074   dest->erase(len);
2075 
2076   return true;
2077 }
2078 
Base64Unescape(StringPiece src,std::string * dest)2079 bool Base64Unescape(StringPiece src, std::string *dest) {
2080   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2081 }
2082 
WebSafeBase64Unescape(StringPiece src,std::string * dest)2083 bool WebSafeBase64Unescape(StringPiece src, std::string *dest) {
2084   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2085 }
2086 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2087 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2088                          char *dest, int szdest, const char *base64,
2089                          bool do_padding) {
2090   static const char kPad64 = '=';
2091 
2092   if (szsrc <= 0) return 0;
2093 
2094   if (szsrc * 4 > szdest * 3) return 0;
2095 
2096   char *cur_dest = dest;
2097   const unsigned char *cur_src = src;
2098 
2099   char *limit_dest = dest + szdest;
2100   const unsigned char *limit_src = src + szsrc;
2101 
2102   // Three bytes of data encodes to four characters of cyphertext.
2103   // So we can pump through three-byte chunks atomically.
2104   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2105     uint32 in = BigEndian::Load32(cur_src) >> 8;
2106 
2107     cur_dest[0] = base64[in >> 18];
2108     in &= 0x3FFFF;
2109     cur_dest[1] = base64[in >> 12];
2110     in &= 0xFFF;
2111     cur_dest[2] = base64[in >> 6];
2112     in &= 0x3F;
2113     cur_dest[3] = base64[in];
2114 
2115     cur_dest += 4;
2116     cur_src += 3;
2117   }
2118   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2119   szdest = limit_dest - cur_dest;
2120   szsrc = limit_src - cur_src;
2121 
2122   /* now deal with the tail (<=3 bytes) */
2123   switch (szsrc) {
2124     case 0:
2125       // Nothing left; nothing more to do.
2126       break;
2127     case 1: {
2128       // One byte left: this encodes to two characters, and (optionally)
2129       // two pad characters to round out the four-character cypherblock.
2130       if ((szdest -= 2) < 0) return 0;
2131       uint32 in = cur_src[0];
2132       cur_dest[0] = base64[in >> 2];
2133       in &= 0x3;
2134       cur_dest[1] = base64[in << 4];
2135       cur_dest += 2;
2136       if (do_padding) {
2137         if ((szdest -= 2) < 0) return 0;
2138         cur_dest[0] = kPad64;
2139         cur_dest[1] = kPad64;
2140         cur_dest += 2;
2141       }
2142       break;
2143     }
2144     case 2: {
2145       // Two bytes left: this encodes to three characters, and (optionally)
2146       // one pad character to round out the four-character cypherblock.
2147       if ((szdest -= 3) < 0) return 0;
2148       uint32 in = BigEndian::Load16(cur_src);
2149       cur_dest[0] = base64[in >> 10];
2150       in &= 0x3FF;
2151       cur_dest[1] = base64[in >> 4];
2152       in &= 0x00F;
2153       cur_dest[2] = base64[in << 2];
2154       cur_dest += 3;
2155       if (do_padding) {
2156         if ((szdest -= 1) < 0) return 0;
2157         cur_dest[0] = kPad64;
2158         cur_dest += 1;
2159       }
2160       break;
2161     }
2162     case 3: {
2163       // Three bytes left: same as in the big loop above.  We can't do this in
2164       // the loop because the loop above always reads 4 bytes, and the fourth
2165       // byte is past the end of the input.
2166       if ((szdest -= 4) < 0) return 0;
2167       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2168       cur_dest[0] = base64[in >> 18];
2169       in &= 0x3FFFF;
2170       cur_dest[1] = base64[in >> 12];
2171       in &= 0xFFF;
2172       cur_dest[2] = base64[in >> 6];
2173       in &= 0x3F;
2174       cur_dest[3] = base64[in];
2175       cur_dest += 4;
2176       break;
2177     }
2178     default:
2179       // Should not be reached: blocks of 4 bytes are handled
2180       // in the while loop before this switch statement.
2181       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2182       break;
2183   }
2184   return (cur_dest - dest);
2185 }
2186 
2187 static const char kBase64Chars[] =
2188 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2189 
2190 static const char kWebSafeBase64Chars[] =
2191 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2192 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2193 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2194   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2195 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2196 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2197                         int szdest, bool do_padding) {
2198   return Base64EscapeInternal(src, szsrc, dest, szdest,
2199                               kWebSafeBase64Chars, do_padding);
2200 }
2201 
Base64EscapeInternal(const unsigned char * src,int szsrc,std::string * dest,bool do_padding,const char * base64_chars)2202 void Base64EscapeInternal(const unsigned char *src, int szsrc,
2203                           std::string *dest, bool do_padding,
2204                           const char *base64_chars) {
2205   const int calc_escaped_size =
2206     CalculateBase64EscapedLen(szsrc, do_padding);
2207   dest->resize(calc_escaped_size);
2208   const int escaped_len = Base64EscapeInternal(src, szsrc,
2209                                                string_as_array(dest),
2210                                                dest->size(),
2211                                                base64_chars,
2212                                                do_padding);
2213   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2214   dest->erase(escaped_len);
2215 }
2216 
Base64Escape(const unsigned char * src,int szsrc,std::string * dest,bool do_padding)2217 void Base64Escape(const unsigned char *src, int szsrc, std::string *dest,
2218                   bool do_padding) {
2219   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2220 }
2221 
WebSafeBase64Escape(const unsigned char * src,int szsrc,std::string * dest,bool do_padding)2222 void WebSafeBase64Escape(const unsigned char *src, int szsrc, std::string *dest,
2223                          bool do_padding) {
2224   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2225 }
2226 
Base64Escape(StringPiece src,std::string * dest)2227 void Base64Escape(StringPiece src, std::string *dest) {
2228   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2229                src.size(), dest, true);
2230 }
2231 
WebSafeBase64Escape(StringPiece src,std::string * dest)2232 void WebSafeBase64Escape(StringPiece src, std::string *dest) {
2233   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2234                       src.size(), dest, false);
2235 }
2236 
WebSafeBase64EscapeWithPadding(StringPiece src,std::string * dest)2237 void WebSafeBase64EscapeWithPadding(StringPiece src, std::string *dest) {
2238   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2239                       src.size(), dest, true);
2240 }
2241 
2242 // Helper to append a Unicode code point to a string as UTF8, without bringing
2243 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2244 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2245   uint32 tmp = 0;
2246   int len = 0;
2247   if (code_point <= 0x7f) {
2248     tmp = code_point;
2249     len = 1;
2250   } else if (code_point <= 0x07ff) {
2251     tmp = 0x0000c080 |
2252         ((code_point & 0x07c0) << 2) |
2253         (code_point & 0x003f);
2254     len = 2;
2255   } else if (code_point <= 0xffff) {
2256     tmp = 0x00e08080 |
2257         ((code_point & 0xf000) << 4) |
2258         ((code_point & 0x0fc0) << 2) |
2259         (code_point & 0x003f);
2260     len = 3;
2261   } else {
2262     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2263     // normally only defined up to there as well.
2264     tmp = 0xf0808080 |
2265         ((code_point & 0x1c0000) << 6) |
2266         ((code_point & 0x03f000) << 4) |
2267         ((code_point & 0x000fc0) << 2) |
2268         (code_point & 0x003f);
2269     len = 4;
2270   }
2271   tmp = ghtonl(tmp);
2272   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2273   return len;
2274 }
2275 
2276 // Table of UTF-8 character lengths, based on first byte
2277 static const unsigned char kUTF8LenTbl[256] = {
2278     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2279     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2280     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2281     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2282     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2283     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2284 
2285     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2286     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2287     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2288     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2289     2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2290     3, 3, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2291 
2292 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2293 int UTF8FirstLetterNumBytes(const char* src, int len) {
2294   if (len == 0) {
2295     return 0;
2296   }
2297   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2298 }
2299 
2300 // ----------------------------------------------------------------------
2301 // CleanStringLineEndings()
2302 //   Clean up a multi-line string to conform to Unix line endings.
2303 //   Reads from src and appends to dst, so usually dst should be empty.
2304 //
2305 //   If there is no line ending at the end of a non-empty string, it can
2306 //   be added automatically.
2307 //
2308 //   Four different types of input are correctly handled:
2309 //
2310 //     - Unix/Linux files: line ending is LF: pass through unchanged
2311 //
2312 //     - DOS/Windows files: line ending is CRLF: convert to LF
2313 //
2314 //     - Legacy Mac files: line ending is CR: convert to LF
2315 //
2316 //     - Garbled files: random line endings: convert gracefully
2317 //                      lonely CR, lonely LF, CRLF: convert to LF
2318 //
2319 //   @param src The multi-line string to convert
2320 //   @param dst The converted string is appended to this string
2321 //   @param auto_end_last_line Automatically terminate the last line
2322 //
2323 //   Limitations:
2324 //
2325 //     This does not do the right thing for CRCRLF files created by
2326 //     broken programs that do another Unix->DOS conversion on files
2327 //     that are already in CRLF format.  For this, a two-pass approach
2328 //     brute-force would be needed that
2329 //
2330 //       (1) determines the presence of LF (first one is ok)
2331 //       (2) if yes, removes any CR, else convert every CR to LF
2332 
CleanStringLineEndings(const std::string & src,std::string * dst,bool auto_end_last_line)2333 void CleanStringLineEndings(const std::string &src, std::string *dst,
2334                             bool auto_end_last_line) {
2335   if (dst->empty()) {
2336     dst->append(src);
2337     CleanStringLineEndings(dst, auto_end_last_line);
2338   } else {
2339     std::string tmp = src;
2340     CleanStringLineEndings(&tmp, auto_end_last_line);
2341     dst->append(tmp);
2342   }
2343 }
2344 
CleanStringLineEndings(std::string * str,bool auto_end_last_line)2345 void CleanStringLineEndings(std::string *str, bool auto_end_last_line) {
2346   ptrdiff_t output_pos = 0;
2347   bool r_seen = false;
2348   ptrdiff_t len = str->size();
2349 
2350   char *p = &(*str)[0];
2351 
2352   for (ptrdiff_t input_pos = 0; input_pos < len;) {
2353     if (!r_seen && input_pos + 8 < len) {
2354       uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2355       // Loop over groups of 8 bytes at a time until we come across
2356       // a word that has a byte whose value is less than or equal to
2357       // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2358       //
2359       // We use a has_less macro that quickly tests a whole 64-bit
2360       // word to see if any of the bytes has a value < N.
2361       //
2362       // For more details, see:
2363       //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2364 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2365       if (!has_less(v, '\r' + 1)) {
2366 #undef has_less
2367         // No byte in this word has a value that could be a \r or a \n
2368         if (output_pos != input_pos) {
2369           GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2370         }
2371         input_pos += 8;
2372         output_pos += 8;
2373         continue;
2374       }
2375     }
2376     std::string::const_reference in = p[input_pos];
2377     if (in == '\r') {
2378       if (r_seen) p[output_pos++] = '\n';
2379       r_seen = true;
2380     } else if (in == '\n') {
2381       if (input_pos != output_pos)
2382         p[output_pos++] = '\n';
2383       else
2384         output_pos++;
2385       r_seen = false;
2386     } else {
2387       if (r_seen) p[output_pos++] = '\n';
2388       r_seen = false;
2389       if (input_pos != output_pos)
2390         p[output_pos++] = in;
2391       else
2392         output_pos++;
2393     }
2394     input_pos++;
2395   }
2396   if (r_seen ||
2397       (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2398     str->resize(output_pos + 1);
2399     str->operator[](output_pos) = '\n';
2400   } else if (output_pos < len) {
2401     str->resize(output_pos);
2402   }
2403 }
2404 
2405 namespace internal {
2406 
2407 // ----------------------------------------------------------------------
2408 // NoLocaleStrtod()
2409 //   This code will make you cry.
2410 // ----------------------------------------------------------------------
2411 
2412 namespace {
2413 
2414 // Returns a string identical to *input except that the character pointed to
2415 // by radix_pos (which should be '.') is replaced with the locale-specific
2416 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2417 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2418   // Determine the locale-specific radix character by calling sprintf() to
2419   // print the number 1.5, then stripping off the digits.  As far as I can
2420   // tell, this is the only portable, thread-safe way to get the C library
2421   // to divuldge the locale's radix character.  No, localeconv() is NOT
2422   // thread-safe.
2423   char temp[16];
2424   int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2425   GOOGLE_CHECK_EQ(temp[0], '1');
2426   GOOGLE_CHECK_EQ(temp[size - 1], '5');
2427   GOOGLE_CHECK_LE(size, 6);
2428 
2429   // Now replace the '.' in the input with it.
2430   std::string result;
2431   result.reserve(strlen(input) + size - 3);
2432   result.append(input, radix_pos);
2433   result.append(temp + 1, size - 2);
2434   result.append(radix_pos + 1);
2435   return result;
2436 }
2437 
2438 }  // namespace
2439 
NoLocaleStrtod(const char * str,char ** endptr)2440 double NoLocaleStrtod(const char *str, char **endptr) {
2441   // We cannot simply set the locale to "C" temporarily with setlocale()
2442   // as this is not thread-safe.  Instead, we try to parse in the current
2443   // locale first.  If parsing stops at a '.' character, then this is a
2444   // pretty good hint that we're actually in some other locale in which
2445   // '.' is not the radix character.
2446 
2447   char *temp_endptr;
2448   double result = strtod(str, &temp_endptr);
2449   if (endptr != NULL) *endptr = temp_endptr;
2450   if (*temp_endptr != '.') return result;
2451 
2452   // Parsing halted on a '.'.  Perhaps we're in a different locale?  Let's
2453   // try to replace the '.' with a locale-specific radix character and
2454   // try again.
2455   std::string localized = LocalizeRadix(str, temp_endptr);
2456   const char *localized_cstr = localized.c_str();
2457   char *localized_endptr;
2458   result = strtod(localized_cstr, &localized_endptr);
2459   if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2460     // This attempt got further, so replacing the decimal must have helped.
2461     // Update endptr to point at the right location.
2462     if (endptr != NULL) {
2463       // size_diff is non-zero if the localized radix has multiple bytes.
2464       int size_diff = localized.size() - strlen(str);
2465       // const_cast is necessary to match the strtod() interface.
2466       *endptr = const_cast<char *>(
2467           str + (localized_endptr - localized_cstr - size_diff));
2468     }
2469   }
2470 
2471   return result;
2472 }
2473 
2474 }  // namespace internal
2475 
2476 }  // namespace protobuf
2477 }  // namespace google
2478