• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // from google3/strings/strutil.cc
32 
33 #include <google/protobuf/stubs/strutil.h>
34 
35 #include <errno.h>
36 #include <float.h>    // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42 
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45 
46 #ifdef _WIN32
47 // MSVC has only _snprintf, not snprintf.
48 //
49 // MinGW has both snprintf and _snprintf, but they appear to be different
50 // functions.  The former is buggy.  When invoked like so:
51 //   char buffer[32];
52 //   snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
53 // it prints "1.23000e+10".  This is plainly wrong:  %g should never print
54 // trailing zeros after the decimal point.  For some reason this bug only
55 // occurs with some input values, not all.  In any case, _snprintf does the
56 // right thing, so we use it.
57 #define snprintf _snprintf
58 #endif
59 
60 namespace google {
61 namespace protobuf {
62 
63 // These are defined as macros on some platforms.  #undef them so that we can
64 // redefine them.
65 #undef isxdigit
66 #undef isprint
67 
68 // The definitions of these in ctype.h change based on locale.  Since our
69 // string manipulation is all in relation to the protocol buffer and C++
70 // languages, we always want to use the C locale.  So, we re-define these
71 // exactly as we want them.
isxdigit(char c)72 inline bool isxdigit(char c) {
73   return ('0' <= c && c <= '9') ||
74          ('a' <= c && c <= 'f') ||
75          ('A' <= c && c <= 'F');
76 }
77 
isprint(char c)78 inline bool isprint(char c) {
79   return c >= 0x20 && c <= 0x7E;
80 }
81 
82 // ----------------------------------------------------------------------
83 // ReplaceCharacters
84 //    Replaces any occurrence of the character 'remove' (or the characters
85 //    in 'remove') with the character 'replacewith'.
86 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)87 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
88   const char *str_start = s->c_str();
89   const char *str = str_start;
90   for (str = strpbrk(str, remove);
91        str != nullptr;
92        str = strpbrk(str + 1, remove)) {
93     (*s)[str - str_start] = replacewith;
94   }
95 }
96 
StripWhitespace(string * str)97 void StripWhitespace(string* str) {
98   int str_length = str->length();
99 
100   // Strip off leading whitespace.
101   int first = 0;
102   while (first < str_length && ascii_isspace(str->at(first))) {
103     ++first;
104   }
105   // If entire string is white space.
106   if (first == str_length) {
107     str->clear();
108     return;
109   }
110   if (first > 0) {
111     str->erase(0, first);
112     str_length -= first;
113   }
114 
115   // Strip off trailing whitespace.
116   int last = str_length - 1;
117   while (last >= 0 && ascii_isspace(str->at(last))) {
118     --last;
119   }
120   if (last != (str_length - 1) && last >= 0) {
121     str->erase(last + 1, string::npos);
122   }
123 }
124 
125 // ----------------------------------------------------------------------
126 // StringReplace()
127 //    Replace the "old" pattern with the "new" pattern in a string,
128 //    and append the result to "res".  If replace_all is false,
129 //    it only replaces the first instance of "old."
130 // ----------------------------------------------------------------------
131 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)132 void StringReplace(const string& s, const string& oldsub,
133                    const string& newsub, bool replace_all,
134                    string* res) {
135   if (oldsub.empty()) {
136     res->append(s);  // if empty, append the given string.
137     return;
138   }
139 
140   string::size_type start_pos = 0;
141   string::size_type pos;
142   do {
143     pos = s.find(oldsub, start_pos);
144     if (pos == string::npos) {
145       break;
146     }
147     res->append(s, start_pos, pos - start_pos);
148     res->append(newsub);
149     start_pos = pos + oldsub.size();  // start searching again after the "old"
150   } while (replace_all);
151   res->append(s, start_pos, s.length() - start_pos);
152 }
153 
154 // ----------------------------------------------------------------------
155 // StringReplace()
156 //    Give me a string and two patterns "old" and "new", and I replace
157 //    the first instance of "old" in the string with "new", if it
158 //    exists.  If "global" is true; call this repeatedly until it
159 //    fails.  RETURN a new string, regardless of whether the replacement
160 //    happened or not.
161 // ----------------------------------------------------------------------
162 
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)163 string StringReplace(const string& s, const string& oldsub,
164                      const string& newsub, bool replace_all) {
165   string ret;
166   StringReplace(s, oldsub, newsub, replace_all, &ret);
167   return ret;
168 }
169 
170 // ----------------------------------------------------------------------
171 // SplitStringUsing()
172 //    Split a string using a character delimiter. Append the components
173 //    to 'result'.
174 //
175 // Note: For multi-character delimiters, this routine will split on *ANY* of
176 // the characters in the string, not the entire string as a single delimiter.
177 // ----------------------------------------------------------------------
178 template <typename ITR>
SplitStringToIteratorUsing(StringPiece full,const char * delim,ITR & result)179 static inline void SplitStringToIteratorUsing(StringPiece full,
180                                               const char *delim, ITR &result) {
181   // Optimize the common case where delim is a single character.
182   if (delim[0] != '\0' && delim[1] == '\0') {
183     char c = delim[0];
184     const char* p = full.data();
185     const char* end = p + full.size();
186     while (p != end) {
187       if (*p == c) {
188         ++p;
189       } else {
190         const char* start = p;
191         while (++p != end && *p != c);
192         *result++ = std::string(start, p - start);
193       }
194     }
195     return;
196   }
197 
198   string::size_type begin_index, end_index;
199   begin_index = full.find_first_not_of(delim);
200   while (begin_index != string::npos) {
201     end_index = full.find_first_of(delim, begin_index);
202     if (end_index == string::npos) {
203       *result++ = std::string(full.substr(begin_index));
204       return;
205     }
206     *result++ =
207         std::string(full.substr(begin_index, (end_index - begin_index)));
208     begin_index = full.find_first_not_of(delim, end_index);
209   }
210 }
211 
SplitStringUsing(StringPiece full,const char * delim,std::vector<string> * result)212 void SplitStringUsing(StringPiece full, const char *delim,
213                       std::vector<string> *result) {
214   std::back_insert_iterator< std::vector<string> > it(*result);
215   SplitStringToIteratorUsing(full, delim, it);
216 }
217 
218 // Split a string using a character delimiter. Append the components
219 // to 'result'.  If there are consecutive delimiters, this function
220 // will return corresponding empty strings. The string is split into
221 // at most the specified number of pieces greedily. This means that the
222 // last piece may possibly be split further. To split into as many pieces
223 // as possible, specify 0 as the number of pieces.
224 //
225 // If "full" is the empty string, yields an empty string as the only value.
226 //
227 // If "pieces" is negative for some reason, it returns the whole string
228 // ----------------------------------------------------------------------
229 template <typename ITR>
SplitStringToIteratorAllowEmpty(StringPiece full,const char * delim,int pieces,ITR & result)230 static inline void SplitStringToIteratorAllowEmpty(StringPiece full,
231                                                    const char *delim,
232                                                    int pieces, ITR &result) {
233   string::size_type begin_index, end_index;
234   begin_index = 0;
235 
236   for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
237     end_index = full.find_first_of(delim, begin_index);
238     if (end_index == string::npos) {
239       *result++ = std::string(full.substr(begin_index));
240       return;
241     }
242     *result++ =
243         std::string(full.substr(begin_index, (end_index - begin_index)));
244     begin_index = end_index + 1;
245   }
246   *result++ = std::string(full.substr(begin_index));
247 }
248 
SplitStringAllowEmpty(StringPiece full,const char * delim,std::vector<string> * result)249 void SplitStringAllowEmpty(StringPiece full, const char *delim,
250                            std::vector<string> *result) {
251   std::back_insert_iterator<std::vector<string> > it(*result);
252   SplitStringToIteratorAllowEmpty(full, delim, 0, it);
253 }
254 
255 // ----------------------------------------------------------------------
256 // JoinStrings()
257 //    This merges a vector of string components with delim inserted
258 //    as separaters between components.
259 //
260 // ----------------------------------------------------------------------
261 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)262 static void JoinStringsIterator(const ITERATOR& start,
263                                 const ITERATOR& end,
264                                 const char* delim,
265                                 string* result) {
266   GOOGLE_CHECK(result != nullptr);
267   result->clear();
268   int delim_length = strlen(delim);
269 
270   // Precompute resulting length so we can reserve() memory in one shot.
271   int length = 0;
272   for (ITERATOR iter = start; iter != end; ++iter) {
273     if (iter != start) {
274       length += delim_length;
275     }
276     length += iter->size();
277   }
278   result->reserve(length);
279 
280   // Now combine everything.
281   for (ITERATOR iter = start; iter != end; ++iter) {
282     if (iter != start) {
283       result->append(delim, delim_length);
284     }
285     result->append(iter->data(), iter->size());
286   }
287 }
288 
JoinStrings(const std::vector<string> & components,const char * delim,string * result)289 void JoinStrings(const std::vector<string>& components,
290                  const char* delim,
291                  string * result) {
292   JoinStringsIterator(components.begin(), components.end(), delim, result);
293 }
294 
295 // ----------------------------------------------------------------------
296 // UnescapeCEscapeSequences()
297 //    This does all the unescaping that C does: \ooo, \r, \n, etc
298 //    Returns length of resulting string.
299 //    The implementation of \x parses any positive number of hex digits,
300 //    but it is an error if the value requires more than 8 bits, and the
301 //    result is truncated to 8 bits.
302 //
303 //    The second call stores its errors in a supplied string vector.
304 //    If the string vector pointer is nullptr, it reports the errors with LOG().
305 // ----------------------------------------------------------------------
306 
307 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
308 
309 // Protocol buffers doesn't ever care about errors, but I don't want to remove
310 // the code.
311 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
312 
UnescapeCEscapeSequences(const char * source,char * dest)313 int UnescapeCEscapeSequences(const char* source, char* dest) {
314   return UnescapeCEscapeSequences(source, dest, nullptr);
315 }
316 
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)317 int UnescapeCEscapeSequences(const char* source, char* dest,
318                              std::vector<string> *errors) {
319   GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
320 
321   char* d = dest;
322   const char* p = source;
323 
324   // Small optimization for case where source = dest and there's no escaping
325   while ( p == d && *p != '\0' && *p != '\\' )
326     p++, d++;
327 
328   while (*p != '\0') {
329     if (*p != '\\') {
330       *d++ = *p++;
331     } else {
332       switch ( *++p ) {                    // skip past the '\\'
333         case '\0':
334           LOG_STRING(ERROR, errors) << "String cannot end with \\";
335           *d = '\0';
336           return d - dest;   // we're done with p
337         case 'a':  *d++ = '\a';  break;
338         case 'b':  *d++ = '\b';  break;
339         case 'f':  *d++ = '\f';  break;
340         case 'n':  *d++ = '\n';  break;
341         case 'r':  *d++ = '\r';  break;
342         case 't':  *d++ = '\t';  break;
343         case 'v':  *d++ = '\v';  break;
344         case '\\': *d++ = '\\';  break;
345         case '?':  *d++ = '\?';  break;    // \?  Who knew?
346         case '\'': *d++ = '\'';  break;
347         case '"':  *d++ = '\"';  break;
348         case '0': case '1': case '2': case '3':  // octal digit: 1 to 3 digits
349         case '4': case '5': case '6': case '7': {
350           char ch = *p - '0';
351           if ( IS_OCTAL_DIGIT(p[1]) )
352             ch = ch * 8 + *++p - '0';
353           if ( IS_OCTAL_DIGIT(p[1]) )      // safe (and easy) to do this twice
354             ch = ch * 8 + *++p - '0';      // now points at last digit
355           *d++ = ch;
356           break;
357         }
358         case 'x': case 'X': {
359           if (!isxdigit(p[1])) {
360             if (p[1] == '\0') {
361               LOG_STRING(ERROR, errors) << "String cannot end with \\x";
362             } else {
363               LOG_STRING(ERROR, errors) <<
364                 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
365             }
366             break;
367           }
368           unsigned int ch = 0;
369           const char *hex_start = p;
370           while (isxdigit(p[1]))  // arbitrarily many hex digits
371             ch = (ch << 4) + hex_digit_to_int(*++p);
372           if (ch > 0xFF)
373             LOG_STRING(ERROR, errors) << "Value of " <<
374               "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
375           *d++ = ch;
376           break;
377         }
378 #if 0  // TODO(kenton):  Support \u and \U?  Requires runetochar().
379         case 'u': {
380           // \uhhhh => convert 4 hex digits to UTF-8
381           char32 rune = 0;
382           const char *hex_start = p;
383           for (int i = 0; i < 4; ++i) {
384             if (isxdigit(p[1])) {  // Look one char ahead.
385               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
386             } else {
387               LOG_STRING(ERROR, errors)
388                 << "\\u must be followed by 4 hex digits: \\"
389                 <<  string(hex_start, p+1-hex_start);
390               break;
391             }
392           }
393           d += runetochar(d, &rune);
394           break;
395         }
396         case 'U': {
397           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
398           char32 rune = 0;
399           const char *hex_start = p;
400           for (int i = 0; i < 8; ++i) {
401             if (isxdigit(p[1])) {  // Look one char ahead.
402               // Don't change rune until we're sure this
403               // is within the Unicode limit, but do advance p.
404               char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
405               if (newrune > 0x10FFFF) {
406                 LOG_STRING(ERROR, errors)
407                   << "Value of \\"
408                   << string(hex_start, p + 1 - hex_start)
409                   << " exceeds Unicode limit (0x10FFFF)";
410                 break;
411               } else {
412                 rune = newrune;
413               }
414             } else {
415               LOG_STRING(ERROR, errors)
416                 << "\\U must be followed by 8 hex digits: \\"
417                 <<  string(hex_start, p+1-hex_start);
418               break;
419             }
420           }
421           d += runetochar(d, &rune);
422           break;
423         }
424 #endif
425         default:
426           LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
427       }
428       p++;                                 // read past letter we escaped
429     }
430   }
431   *d = '\0';
432   return d - dest;
433 }
434 
435 // ----------------------------------------------------------------------
436 // UnescapeCEscapeString()
437 //    This does the same thing as UnescapeCEscapeSequences, but creates
438 //    a new string. The caller does not need to worry about allocating
439 //    a dest buffer. This should be used for non performance critical
440 //    tasks such as printing debug messages. It is safe for src and dest
441 //    to be the same.
442 //
443 //    The second call stores its errors in a supplied string vector.
444 //    If the string vector pointer is nullptr, it reports the errors with LOG().
445 //
446 //    In the first and second calls, the length of dest is returned. In the
447 //    the third call, the new string is returned.
448 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)449 int UnescapeCEscapeString(const string& src, string* dest) {
450   return UnescapeCEscapeString(src, dest, nullptr);
451 }
452 
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)453 int UnescapeCEscapeString(const string& src, string* dest,
454                           std::vector<string> *errors) {
455   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
456   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
457   GOOGLE_CHECK(dest);
458   dest->assign(unescaped.get(), len);
459   return len;
460 }
461 
UnescapeCEscapeString(const string & src)462 string UnescapeCEscapeString(const string& src) {
463   std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
464   int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
465   return string(unescaped.get(), len);
466 }
467 
468 // ----------------------------------------------------------------------
469 // CEscapeString()
470 // CHexEscapeString()
471 //    Copies 'src' to 'dest', escaping dangerous characters using
472 //    C-style escape sequences. This is very useful for preparing query
473 //    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
474 //    hexadecimal rather than octal sequences.
475 //    Returns the number of bytes written to 'dest' (not including the \0)
476 //    or -1 if there was insufficient space.
477 //
478 //    Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
479 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)480 int CEscapeInternal(const char* src, int src_len, char* dest,
481                     int dest_len, bool use_hex, bool utf8_safe) {
482   const char* src_end = src + src_len;
483   int used = 0;
484   bool last_hex_escape = false; // true if last output char was \xNN
485 
486   for (; src < src_end; src++) {
487     if (dest_len - used < 2)   // Need space for two letter escape
488       return -1;
489 
490     bool is_hex_escape = false;
491     switch (*src) {
492       case '\n': dest[used++] = '\\'; dest[used++] = 'n';  break;
493       case '\r': dest[used++] = '\\'; dest[used++] = 'r';  break;
494       case '\t': dest[used++] = '\\'; dest[used++] = 't';  break;
495       case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
496       case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
497       case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
498       default:
499         // Note that if we emit \xNN and the src character after that is a hex
500         // digit then that digit must be escaped too to prevent it being
501         // interpreted as part of the character code by C.
502         if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
503             (!isprint(*src) ||
504              (last_hex_escape && isxdigit(*src)))) {
505           if (dest_len - used < 4) // need space for 4 letter escape
506             return -1;
507           sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
508                   static_cast<uint8>(*src));
509           is_hex_escape = use_hex;
510           used += 4;
511         } else {
512           dest[used++] = *src; break;
513         }
514     }
515     last_hex_escape = is_hex_escape;
516   }
517 
518   if (dest_len - used < 1)   // make sure that there is room for \0
519     return -1;
520 
521   dest[used] = '\0';   // doesn't count towards return value though
522   return used;
523 }
524 
525 // Calculates the length of the C-style escaped version of 'src'.
526 // Assumes that non-printable characters are escaped using octal sequences, and
527 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)528 static inline size_t CEscapedLength(StringPiece src) {
529   static char c_escaped_len[256] = {
530     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
531     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
532     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
533     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
534     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
535     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
536     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
537     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
538     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
539     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
540     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
541     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
542     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
543     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
544     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
545     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
546   };
547 
548   size_t escaped_len = 0;
549   for (int i = 0; i < src.size(); ++i) {
550     unsigned char c = static_cast<unsigned char>(src[i]);
551     escaped_len += c_escaped_len[c];
552   }
553   return escaped_len;
554 }
555 
556 // ----------------------------------------------------------------------
557 // Escapes 'src' using C-style escape sequences, and appends the escaped string
558 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
559 // the required space using a lookup table, and also does not do any special
560 // handling for Hex or UTF-8 characters.
561 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)562 void CEscapeAndAppend(StringPiece src, string* dest) {
563   size_t escaped_len = CEscapedLength(src);
564   if (escaped_len == src.size()) {
565     dest->append(src.data(), src.size());
566     return;
567   }
568 
569   size_t cur_dest_len = dest->size();
570   dest->resize(cur_dest_len + escaped_len);
571   char* append_ptr = &(*dest)[cur_dest_len];
572 
573   for (int i = 0; i < src.size(); ++i) {
574     unsigned char c = static_cast<unsigned char>(src[i]);
575     switch (c) {
576       case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
577       case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
578       case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
579       case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
580       case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
581       case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
582       default:
583         if (!isprint(c)) {
584           *append_ptr++ = '\\';
585           *append_ptr++ = '0' + c / 64;
586           *append_ptr++ = '0' + (c % 64) / 8;
587           *append_ptr++ = '0' + c % 8;
588         } else {
589           *append_ptr++ = c;
590         }
591         break;
592     }
593   }
594 }
595 
CEscape(const string & src)596 string CEscape(const string& src) {
597   string dest;
598   CEscapeAndAppend(src, &dest);
599   return dest;
600 }
601 
602 namespace strings {
603 
Utf8SafeCEscape(const string & src)604 string Utf8SafeCEscape(const string& src) {
605   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
606   std::unique_ptr<char[]> dest(new char[dest_length]);
607   const int len = CEscapeInternal(src.data(), src.size(),
608                                   dest.get(), dest_length, false, true);
609   GOOGLE_DCHECK_GE(len, 0);
610   return string(dest.get(), len);
611 }
612 
CHexEscape(const string & src)613 string CHexEscape(const string& src) {
614   const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
615   std::unique_ptr<char[]> dest(new char[dest_length]);
616   const int len = CEscapeInternal(src.data(), src.size(),
617                                   dest.get(), dest_length, true, false);
618   GOOGLE_DCHECK_GE(len, 0);
619   return string(dest.get(), len);
620 }
621 
622 }  // namespace strings
623 
624 // ----------------------------------------------------------------------
625 // strto32_adaptor()
626 // strtou32_adaptor()
627 //    Implementation of strto[u]l replacements that have identical
628 //    overflow and underflow characteristics for both ILP-32 and LP-64
629 //    platforms, including errno preservation in error-free calls.
630 // ----------------------------------------------------------------------
631 
strto32_adaptor(const char * nptr,char ** endptr,int base)632 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
633   const int saved_errno = errno;
634   errno = 0;
635   const long result = strtol(nptr, endptr, base);
636   if (errno == ERANGE && result == LONG_MIN) {
637     return kint32min;
638   } else if (errno == ERANGE && result == LONG_MAX) {
639     return kint32max;
640   } else if (errno == 0 && result < kint32min) {
641     errno = ERANGE;
642     return kint32min;
643   } else if (errno == 0 && result > kint32max) {
644     errno = ERANGE;
645     return kint32max;
646   }
647   if (errno == 0)
648     errno = saved_errno;
649   return static_cast<int32>(result);
650 }
651 
strtou32_adaptor(const char * nptr,char ** endptr,int base)652 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
653   const int saved_errno = errno;
654   errno = 0;
655   const unsigned long result = strtoul(nptr, endptr, base);
656   if (errno == ERANGE && result == ULONG_MAX) {
657     return kuint32max;
658   } else if (errno == 0 && result > kuint32max) {
659     errno = ERANGE;
660     return kuint32max;
661   }
662   if (errno == 0)
663     errno = saved_errno;
664   return static_cast<uint32>(result);
665 }
666 
safe_parse_sign(string * text,bool * negative_ptr)667 inline bool safe_parse_sign(string* text  /*inout*/,
668                             bool* negative_ptr  /*output*/) {
669   const char* start = text->data();
670   const char* end = start + text->size();
671 
672   // Consume whitespace.
673   while (start < end && (start[0] == ' ')) {
674     ++start;
675   }
676   while (start < end && (end[-1] == ' ')) {
677     --end;
678   }
679   if (start >= end) {
680     return false;
681   }
682 
683   // Consume sign.
684   *negative_ptr = (start[0] == '-');
685   if (*negative_ptr || start[0] == '+') {
686     ++start;
687     if (start >= end) {
688       return false;
689     }
690   }
691   *text = text->substr(start - text->data(), end - start);
692   return true;
693 }
694 
695 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)696 bool safe_parse_positive_int(
697     string text, IntType* value_p) {
698   int base = 10;
699   IntType value = 0;
700   const IntType vmax = std::numeric_limits<IntType>::max();
701   assert(vmax > 0);
702   assert(vmax >= base);
703   const IntType vmax_over_base = vmax / base;
704   const char* start = text.data();
705   const char* end = start + text.size();
706   // loop over digits
707   for (; start < end; ++start) {
708     unsigned char c = static_cast<unsigned char>(start[0]);
709     int digit = c - '0';
710     if (digit >= base || digit < 0) {
711       *value_p = value;
712       return false;
713     }
714     if (value > vmax_over_base) {
715       *value_p = vmax;
716       return false;
717     }
718     value *= base;
719     if (value > vmax - digit) {
720       *value_p = vmax;
721       return false;
722     }
723     value += digit;
724   }
725   *value_p = value;
726   return true;
727 }
728 
729 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)730 bool safe_parse_negative_int(
731     const string& text, IntType* value_p) {
732   int base = 10;
733   IntType value = 0;
734   const IntType vmin = std::numeric_limits<IntType>::min();
735   assert(vmin < 0);
736   assert(vmin <= 0 - base);
737   IntType vmin_over_base = vmin / base;
738   // 2003 c++ standard [expr.mul]
739   // "... the sign of the remainder is implementation-defined."
740   // Although (vmin/base)*base + vmin%base is always vmin.
741   // 2011 c++ standard tightens the spec but we cannot rely on it.
742   if (vmin % base > 0) {
743     vmin_over_base += 1;
744   }
745   const char* start = text.data();
746   const char* end = start + text.size();
747   // loop over digits
748   for (; start < end; ++start) {
749     unsigned char c = static_cast<unsigned char>(start[0]);
750     int digit = c - '0';
751     if (digit >= base || digit < 0) {
752       *value_p = value;
753       return false;
754     }
755     if (value < vmin_over_base) {
756       *value_p = vmin;
757       return false;
758     }
759     value *= base;
760     if (value < vmin + digit) {
761       *value_p = vmin;
762       return false;
763     }
764     value -= digit;
765   }
766   *value_p = value;
767   return true;
768 }
769 
770 template<typename IntType>
safe_int_internal(string text,IntType * value_p)771 bool safe_int_internal(string text, IntType* value_p) {
772   *value_p = 0;
773   bool negative;
774   if (!safe_parse_sign(&text, &negative)) {
775     return false;
776   }
777   if (!negative) {
778     return safe_parse_positive_int(text, value_p);
779   } else {
780     return safe_parse_negative_int(text, value_p);
781   }
782 }
783 
784 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)785 bool safe_uint_internal(string text, IntType* value_p) {
786   *value_p = 0;
787   bool negative;
788   if (!safe_parse_sign(&text, &negative) || negative) {
789     return false;
790   }
791   return safe_parse_positive_int(text, value_p);
792 }
793 
794 // ----------------------------------------------------------------------
795 // FastIntToBuffer()
796 // FastInt64ToBuffer()
797 // FastHexToBuffer()
798 // FastHex64ToBuffer()
799 // FastHex32ToBuffer()
800 // ----------------------------------------------------------------------
801 
802 // Offset into buffer where FastInt64ToBuffer places the end of string
803 // null character.  Also used by FastInt64ToBufferLeft.
804 static const int kFastInt64ToBufferOffset = 21;
805 
FastInt64ToBuffer(int64 i,char * buffer)806 char *FastInt64ToBuffer(int64 i, char* buffer) {
807   // We could collapse the positive and negative sections, but that
808   // would be slightly slower for positive numbers...
809   // 22 bytes is enough to store -2**64, -18446744073709551616.
810   char* p = buffer + kFastInt64ToBufferOffset;
811   *p-- = '\0';
812   if (i >= 0) {
813     do {
814       *p-- = '0' + i % 10;
815       i /= 10;
816     } while (i > 0);
817     return p + 1;
818   } else {
819     // On different platforms, % and / have different behaviors for
820     // negative numbers, so we need to jump through hoops to make sure
821     // we don't divide negative numbers.
822     if (i > -10) {
823       i = -i;
824       *p-- = '0' + i;
825       *p = '-';
826       return p;
827     } else {
828       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
829       i = i + 10;
830       i = -i;
831       *p-- = '0' + i % 10;
832       // Undo what we did a moment ago
833       i = i / 10 + 1;
834       do {
835         *p-- = '0' + i % 10;
836         i /= 10;
837       } while (i > 0);
838       *p = '-';
839       return p;
840     }
841   }
842 }
843 
844 // Offset into buffer where FastInt32ToBuffer places the end of string
845 // null character.  Also used by FastInt32ToBufferLeft
846 static const int kFastInt32ToBufferOffset = 11;
847 
848 // Yes, this is a duplicate of FastInt64ToBuffer.  But, we need this for the
849 // compiler to generate 32 bit arithmetic instructions.  It's much faster, at
850 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)851 char *FastInt32ToBuffer(int32 i, char* buffer) {
852   // We could collapse the positive and negative sections, but that
853   // would be slightly slower for positive numbers...
854   // 12 bytes is enough to store -2**32, -4294967296.
855   char* p = buffer + kFastInt32ToBufferOffset;
856   *p-- = '\0';
857   if (i >= 0) {
858     do {
859       *p-- = '0' + i % 10;
860       i /= 10;
861     } while (i > 0);
862     return p + 1;
863   } else {
864     // On different platforms, % and / have different behaviors for
865     // negative numbers, so we need to jump through hoops to make sure
866     // we don't divide negative numbers.
867     if (i > -10) {
868       i = -i;
869       *p-- = '0' + i;
870       *p = '-';
871       return p;
872     } else {
873       // Make sure we aren't at MIN_INT, in which case we can't say i = -i
874       i = i + 10;
875       i = -i;
876       *p-- = '0' + i % 10;
877       // Undo what we did a moment ago
878       i = i / 10 + 1;
879       do {
880         *p-- = '0' + i % 10;
881         i /= 10;
882       } while (i > 0);
883       *p = '-';
884       return p;
885     }
886   }
887 }
888 
FastHexToBuffer(int i,char * buffer)889 char *FastHexToBuffer(int i, char* buffer) {
890   GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
891 
892   static const char *hexdigits = "0123456789abcdef";
893   char *p = buffer + 21;
894   *p-- = '\0';
895   do {
896     *p-- = hexdigits[i & 15];   // mod by 16
897     i >>= 4;                    // divide by 16
898   } while (i > 0);
899   return p + 1;
900 }
901 
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)902 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
903   static const char *hexdigits = "0123456789abcdef";
904   buffer[num_byte] = '\0';
905   for (int i = num_byte - 1; i >= 0; i--) {
906 #ifdef _M_X64
907     // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
908     // block. Given that the uint32 cast was to improve performance on 32-bit
909     // platforms, we use 64-bit '&' directly.
910     buffer[i] = hexdigits[value & 0xf];
911 #else
912     buffer[i] = hexdigits[uint32(value) & 0xf];
913 #endif
914     value >>= 4;
915   }
916   return buffer;
917 }
918 
FastHex64ToBuffer(uint64 value,char * buffer)919 char *FastHex64ToBuffer(uint64 value, char* buffer) {
920   return InternalFastHexToBuffer(value, buffer, 16);
921 }
922 
FastHex32ToBuffer(uint32 value,char * buffer)923 char *FastHex32ToBuffer(uint32 value, char* buffer) {
924   return InternalFastHexToBuffer(value, buffer, 8);
925 }
926 
927 // ----------------------------------------------------------------------
928 // FastInt32ToBufferLeft()
929 // FastUInt32ToBufferLeft()
930 // FastInt64ToBufferLeft()
931 // FastUInt64ToBufferLeft()
932 //
933 // Like the Fast*ToBuffer() functions above, these are intended for speed.
934 // Unlike the Fast*ToBuffer() functions, however, these functions write
935 // their output to the beginning of the buffer (hence the name, as the
936 // output is left-aligned).  The caller is responsible for ensuring that
937 // the buffer has enough space to hold the output.
938 //
939 // Returns a pointer to the end of the string (i.e. the null character
940 // terminating the string).
941 // ----------------------------------------------------------------------
942 
943 static const char two_ASCII_digits[100][2] = {
944   {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
945   {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
946   {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
947   {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
948   {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
949   {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
950   {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
951   {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
952   {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
953   {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
954   {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
955   {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
956   {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
957   {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
958   {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
959   {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
960   {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
961   {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
962   {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
963   {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
964 };
965 
FastUInt32ToBufferLeft(uint32 u,char * buffer)966 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
967   uint32 digits;
968   const char *ASCII_digits = nullptr;
969   // The idea of this implementation is to trim the number of divides to as few
970   // as possible by using multiplication and subtraction rather than mod (%),
971   // and by outputting two digits at a time rather than one.
972   // The huge-number case is first, in the hopes that the compiler will output
973   // that case in one branch-free block of code, and only output conditional
974   // branches into it from below.
975   if (u >= 1000000000) {  // >= 1,000,000,000
976     digits = u / 100000000;  // 100,000,000
977     ASCII_digits = two_ASCII_digits[digits];
978     buffer[0] = ASCII_digits[0];
979     buffer[1] = ASCII_digits[1];
980     buffer += 2;
981 sublt100_000_000:
982     u -= digits * 100000000;  // 100,000,000
983 lt100_000_000:
984     digits = u / 1000000;  // 1,000,000
985     ASCII_digits = two_ASCII_digits[digits];
986     buffer[0] = ASCII_digits[0];
987     buffer[1] = ASCII_digits[1];
988     buffer += 2;
989 sublt1_000_000:
990     u -= digits * 1000000;  // 1,000,000
991 lt1_000_000:
992     digits = u / 10000;  // 10,000
993     ASCII_digits = two_ASCII_digits[digits];
994     buffer[0] = ASCII_digits[0];
995     buffer[1] = ASCII_digits[1];
996     buffer += 2;
997 sublt10_000:
998     u -= digits * 10000;  // 10,000
999 lt10_000:
1000     digits = u / 100;
1001     ASCII_digits = two_ASCII_digits[digits];
1002     buffer[0] = ASCII_digits[0];
1003     buffer[1] = ASCII_digits[1];
1004     buffer += 2;
1005 sublt100:
1006     u -= digits * 100;
1007 lt100:
1008     digits = u;
1009     ASCII_digits = two_ASCII_digits[digits];
1010     buffer[0] = ASCII_digits[0];
1011     buffer[1] = ASCII_digits[1];
1012     buffer += 2;
1013 done:
1014     *buffer = 0;
1015     return buffer;
1016   }
1017 
1018   if (u < 100) {
1019     digits = u;
1020     if (u >= 10) goto lt100;
1021     *buffer++ = '0' + digits;
1022     goto done;
1023   }
1024   if (u  <  10000) {   // 10,000
1025     if (u >= 1000) goto lt10_000;
1026     digits = u / 100;
1027     *buffer++ = '0' + digits;
1028     goto sublt100;
1029   }
1030   if (u  <  1000000) {   // 1,000,000
1031     if (u >= 100000) goto lt1_000_000;
1032     digits = u / 10000;  //    10,000
1033     *buffer++ = '0' + digits;
1034     goto sublt10_000;
1035   }
1036   if (u  <  100000000) {   // 100,000,000
1037     if (u >= 10000000) goto lt100_000_000;
1038     digits = u / 1000000;  //   1,000,000
1039     *buffer++ = '0' + digits;
1040     goto sublt1_000_000;
1041   }
1042   // we already know that u < 1,000,000,000
1043   digits = u / 100000000;   // 100,000,000
1044   *buffer++ = '0' + digits;
1045   goto sublt100_000_000;
1046 }
1047 
FastInt32ToBufferLeft(int32 i,char * buffer)1048 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1049   uint32 u = 0;
1050   if (i < 0) {
1051     *buffer++ = '-';
1052     u -= i;
1053   } else {
1054     u = i;
1055   }
1056   return FastUInt32ToBufferLeft(u, buffer);
1057 }
1058 
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1059 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1060   int digits;
1061   const char *ASCII_digits = nullptr;
1062 
1063   uint32 u = static_cast<uint32>(u64);
1064   if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1065 
1066   uint64 top_11_digits = u64 / 1000000000;
1067   buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1068   u = u64 - (top_11_digits * 1000000000);
1069 
1070   digits = u / 10000000;  // 10,000,000
1071   GOOGLE_DCHECK_LT(digits, 100);
1072   ASCII_digits = two_ASCII_digits[digits];
1073   buffer[0] = ASCII_digits[0];
1074   buffer[1] = ASCII_digits[1];
1075   buffer += 2;
1076   u -= digits * 10000000;  // 10,000,000
1077   digits = u / 100000;  // 100,000
1078   ASCII_digits = two_ASCII_digits[digits];
1079   buffer[0] = ASCII_digits[0];
1080   buffer[1] = ASCII_digits[1];
1081   buffer += 2;
1082   u -= digits * 100000;  // 100,000
1083   digits = u / 1000;  // 1,000
1084   ASCII_digits = two_ASCII_digits[digits];
1085   buffer[0] = ASCII_digits[0];
1086   buffer[1] = ASCII_digits[1];
1087   buffer += 2;
1088   u -= digits * 1000;  // 1,000
1089   digits = u / 10;
1090   ASCII_digits = two_ASCII_digits[digits];
1091   buffer[0] = ASCII_digits[0];
1092   buffer[1] = ASCII_digits[1];
1093   buffer += 2;
1094   u -= digits * 10;
1095   digits = u;
1096   *buffer++ = '0' + digits;
1097   *buffer = 0;
1098   return buffer;
1099 }
1100 
FastInt64ToBufferLeft(int64 i,char * buffer)1101 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1102   uint64 u = 0;
1103   if (i < 0) {
1104     *buffer++ = '-';
1105     u -= i;
1106   } else {
1107     u = i;
1108   }
1109   return FastUInt64ToBufferLeft(u, buffer);
1110 }
1111 
1112 // ----------------------------------------------------------------------
1113 // SimpleItoa()
1114 //    Description: converts an integer to a string.
1115 //
1116 //    Return value: string
1117 // ----------------------------------------------------------------------
1118 
SimpleItoa(int i)1119 string SimpleItoa(int i) {
1120   char buffer[kFastToBufferSize];
1121   return (sizeof(i) == 4) ?
1122     FastInt32ToBuffer(i, buffer) :
1123     FastInt64ToBuffer(i, buffer);
1124 }
1125 
SimpleItoa(unsigned int i)1126 string SimpleItoa(unsigned int i) {
1127   char buffer[kFastToBufferSize];
1128   return string(buffer, (sizeof(i) == 4) ?
1129     FastUInt32ToBufferLeft(i, buffer) :
1130     FastUInt64ToBufferLeft(i, buffer));
1131 }
1132 
SimpleItoa(long i)1133 string SimpleItoa(long i) {
1134   char buffer[kFastToBufferSize];
1135   return (sizeof(i) == 4) ?
1136     FastInt32ToBuffer(i, buffer) :
1137     FastInt64ToBuffer(i, buffer);
1138 }
1139 
SimpleItoa(unsigned long i)1140 string SimpleItoa(unsigned long i) {
1141   char buffer[kFastToBufferSize];
1142   return string(buffer, (sizeof(i) == 4) ?
1143     FastUInt32ToBufferLeft(i, buffer) :
1144     FastUInt64ToBufferLeft(i, buffer));
1145 }
1146 
SimpleItoa(long long i)1147 string SimpleItoa(long long i) {
1148   char buffer[kFastToBufferSize];
1149   return (sizeof(i) == 4) ?
1150     FastInt32ToBuffer(i, buffer) :
1151     FastInt64ToBuffer(i, buffer);
1152 }
1153 
SimpleItoa(unsigned long long i)1154 string SimpleItoa(unsigned long long i) {
1155   char buffer[kFastToBufferSize];
1156   return string(buffer, (sizeof(i) == 4) ?
1157     FastUInt32ToBufferLeft(i, buffer) :
1158     FastUInt64ToBufferLeft(i, buffer));
1159 }
1160 
1161 // ----------------------------------------------------------------------
1162 // SimpleDtoa()
1163 // SimpleFtoa()
1164 // DoubleToBuffer()
1165 // FloatToBuffer()
1166 //    We want to print the value without losing precision, but we also do
1167 //    not want to print more digits than necessary.  This turns out to be
1168 //    trickier than it sounds.  Numbers like 0.2 cannot be represented
1169 //    exactly in binary.  If we print 0.2 with a very large precision,
1170 //    e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1171 //    On the other hand, if we set the precision too low, we lose
1172 //    significant digits when printing numbers that actually need them.
1173 //    It turns out there is no precision value that does the right thing
1174 //    for all numbers.
1175 //
1176 //    Our strategy is to first try printing with a precision that is never
1177 //    over-precise, then parse the result with strtod() to see if it
1178 //    matches.  If not, we print again with a precision that will always
1179 //    give a precise result, but may use more digits than necessary.
1180 //
1181 //    An arguably better strategy would be to use the algorithm described
1182 //    in "How to Print Floating-Point Numbers Accurately" by Steele &
1183 //    White, e.g. as implemented by David M. Gay's dtoa().  It turns out,
1184 //    however, that the following implementation is about as fast as
1185 //    DMG's code.  Furthermore, DMG's code locks mutexes, which means it
1186 //    will not scale well on multi-core machines.  DMG's code is slightly
1187 //    more accurate (in that it will never use more digits than
1188 //    necessary), but this is probably irrelevant for most users.
1189 //
1190 //    Rob Pike and Ken Thompson also have an implementation of dtoa() in
1191 //    third_party/fmt/fltfmt.cc.  Their implementation is similar to this
1192 //    one in that it makes guesses and then uses strtod() to check them.
1193 //    Their implementation is faster because they use their own code to
1194 //    generate the digits in the first place rather than use snprintf(),
1195 //    thus avoiding format string parsing overhead.  However, this makes
1196 //    it considerably more complicated than the following implementation,
1197 //    and it is embedded in a larger library.  If speed turns out to be
1198 //    an issue, we could re-implement this in terms of their
1199 //    implementation.
1200 // ----------------------------------------------------------------------
1201 
SimpleDtoa(double value)1202 string SimpleDtoa(double value) {
1203   char buffer[kDoubleToBufferSize];
1204   return DoubleToBuffer(value, buffer);
1205 }
1206 
SimpleFtoa(float value)1207 string SimpleFtoa(float value) {
1208   char buffer[kFloatToBufferSize];
1209   return FloatToBuffer(value, buffer);
1210 }
1211 
IsValidFloatChar(char c)1212 static inline bool IsValidFloatChar(char c) {
1213   return ('0' <= c && c <= '9') ||
1214          c == 'e' || c == 'E' ||
1215          c == '+' || c == '-';
1216 }
1217 
DelocalizeRadix(char * buffer)1218 void DelocalizeRadix(char* buffer) {
1219   // Fast check:  if the buffer has a normal decimal point, assume no
1220   // translation is needed.
1221   if (strchr(buffer, '.') != nullptr) return;
1222 
1223   // Find the first unknown character.
1224   while (IsValidFloatChar(*buffer)) ++buffer;
1225 
1226   if (*buffer == '\0') {
1227     // No radix character found.
1228     return;
1229   }
1230 
1231   // We are now pointing at the locale-specific radix character.  Replace it
1232   // with '.'.
1233   *buffer = '.';
1234   ++buffer;
1235 
1236   if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1237     // It appears the radix was a multi-byte character.  We need to remove the
1238     // extra bytes.
1239     char* target = buffer;
1240     do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1241     memmove(target, buffer, strlen(buffer) + 1);
1242   }
1243 }
1244 
DoubleToBuffer(double value,char * buffer)1245 char* DoubleToBuffer(double value, char* buffer) {
1246   // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1247   // platforms these days.  Just in case some system exists where DBL_DIG
1248   // is significantly larger -- and risks overflowing our buffer -- we have
1249   // this assert.
1250   GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1251 
1252   if (value == std::numeric_limits<double>::infinity()) {
1253     strcpy(buffer, "inf");
1254     return buffer;
1255   } else if (value == -std::numeric_limits<double>::infinity()) {
1256     strcpy(buffer, "-inf");
1257     return buffer;
1258   } else if (std::isnan(value)) {
1259     strcpy(buffer, "nan");
1260     return buffer;
1261   }
1262 
1263   int snprintf_result =
1264     snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1265 
1266   // The snprintf should never overflow because the buffer is significantly
1267   // larger than the precision we asked for.
1268   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1269 
1270   // We need to make parsed_value volatile in order to force the compiler to
1271   // write it out to the stack.  Otherwise, it may keep the value in a
1272   // register, and if it does that, it may keep it as a long double instead
1273   // of a double.  This long double may have extra bits that make it compare
1274   // unequal to "value" even though it would be exactly equal if it were
1275   // truncated to a double.
1276   volatile double parsed_value = internal::NoLocaleStrtod(buffer, nullptr);
1277   if (parsed_value != value) {
1278     int snprintf_result =
1279       snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1280 
1281     // Should never overflow; see above.
1282     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1283   }
1284 
1285   DelocalizeRadix(buffer);
1286   return buffer;
1287 }
1288 
memcasecmp(const char * s1,const char * s2,size_t len)1289 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1290   const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1291   const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1292 
1293   for ( int i = 0; i < len; i++ ) {
1294     const int diff =
1295       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1296       static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1297     if (diff != 0) return diff;
1298   }
1299   return 0;
1300 }
1301 
CaseEqual(StringPiece s1,StringPiece s2)1302 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1303   if (s1.size() != s2.size()) return false;
1304   return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1305 }
1306 
safe_strtob(StringPiece str,bool * value)1307 bool safe_strtob(StringPiece str, bool* value) {
1308   GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1309   if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1310       CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1311       CaseEqual(str, "1")) {
1312     *value = true;
1313     return true;
1314   }
1315   if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1316       CaseEqual(str, "no") || CaseEqual(str, "n") ||
1317       CaseEqual(str, "0")) {
1318     *value = false;
1319     return true;
1320   }
1321   return false;
1322 }
1323 
safe_strtof(const char * str,float * value)1324 bool safe_strtof(const char* str, float* value) {
1325   char* endptr;
1326   errno = 0;  // errno only gets set on errors
1327 #if defined(_WIN32) || defined (__hpux)  // has no strtof()
1328   *value = internal::NoLocaleStrtod(str, &endptr);
1329 #else
1330   *value = strtof(str, &endptr);
1331 #endif
1332   return *str != 0 && *endptr == 0 && errno == 0;
1333 }
1334 
safe_strtod(const char * str,double * value)1335 bool safe_strtod(const char* str, double* value) {
1336   char* endptr;
1337   *value = internal::NoLocaleStrtod(str, &endptr);
1338   if (endptr != str) {
1339     while (ascii_isspace(*endptr)) ++endptr;
1340   }
1341   // Ignore range errors from strtod.  The values it
1342   // returns on underflow and overflow are the right
1343   // fallback in a robust setting.
1344   return *str != '\0' && *endptr == '\0';
1345 }
1346 
safe_strto32(const string & str,int32 * value)1347 bool safe_strto32(const string& str, int32* value) {
1348   return safe_int_internal(str, value);
1349 }
1350 
safe_strtou32(const string & str,uint32 * value)1351 bool safe_strtou32(const string& str, uint32* value) {
1352   return safe_uint_internal(str, value);
1353 }
1354 
safe_strto64(const string & str,int64 * value)1355 bool safe_strto64(const string& str, int64* value) {
1356   return safe_int_internal(str, value);
1357 }
1358 
safe_strtou64(const string & str,uint64 * value)1359 bool safe_strtou64(const string& str, uint64* value) {
1360   return safe_uint_internal(str, value);
1361 }
1362 
FloatToBuffer(float value,char * buffer)1363 char* FloatToBuffer(float value, char* buffer) {
1364   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1365   // platforms these days.  Just in case some system exists where FLT_DIG
1366   // is significantly larger -- and risks overflowing our buffer -- we have
1367   // this assert.
1368   GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1369 
1370   if (value == std::numeric_limits<double>::infinity()) {
1371     strcpy(buffer, "inf");
1372     return buffer;
1373   } else if (value == -std::numeric_limits<double>::infinity()) {
1374     strcpy(buffer, "-inf");
1375     return buffer;
1376   } else if (std::isnan(value)) {
1377     strcpy(buffer, "nan");
1378     return buffer;
1379   }
1380 
1381   int snprintf_result =
1382     snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1383 
1384   // The snprintf should never overflow because the buffer is significantly
1385   // larger than the precision we asked for.
1386   GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1387 
1388   float parsed_value;
1389   if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1390     int snprintf_result =
1391       snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1392 
1393     // Should never overflow; see above.
1394     GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1395   }
1396 
1397   DelocalizeRadix(buffer);
1398   return buffer;
1399 }
1400 
1401 namespace strings {
1402 
AlphaNum(strings::Hex hex)1403 AlphaNum::AlphaNum(strings::Hex hex) {
1404   char *const end = &digits[kFastToBufferSize];
1405   char *writer = end;
1406   uint64 value = hex.value;
1407   uint64 width = hex.spec;
1408   // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1409   // where 0x10000 is the smallest hex number that is as wide as the user
1410   // asked for.
1411   uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1412   static const char hexdigits[] = "0123456789abcdef";
1413   do {
1414     *--writer = hexdigits[value & 0xF];
1415     value >>= 4;
1416     mask >>= 4;
1417   } while (mask != 0);
1418   piece_data_ = writer;
1419   piece_size_ = end - writer;
1420 }
1421 
1422 }  // namespace strings
1423 
1424 // ----------------------------------------------------------------------
1425 // StrCat()
1426 //    This merges the given strings or integers, with no delimiter.  This
1427 //    is designed to be the fastest possible way to construct a string out
1428 //    of a mix of raw C strings, C++ strings, and integer values.
1429 // ----------------------------------------------------------------------
1430 
1431 // Append is merely a version of memcpy that returns the address of the byte
1432 // after the area just overwritten.  It comes in multiple flavors to minimize
1433 // call overhead.
Append1(char * out,const AlphaNum & x)1434 static char *Append1(char *out, const AlphaNum &x) {
1435   if (x.size() > 0) {
1436     memcpy(out, x.data(), x.size());
1437     out += x.size();
1438   }
1439   return out;
1440 }
1441 
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1442 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1443   if (x1.size() > 0) {
1444     memcpy(out, x1.data(), x1.size());
1445     out += x1.size();
1446   }
1447   if (x2.size() > 0) {
1448     memcpy(out, x2.data(), x2.size());
1449     out += x2.size();
1450   }
1451   return out;
1452 }
1453 
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1454 static char *Append4(char *out, const AlphaNum &x1, const AlphaNum &x2,
1455                      const AlphaNum &x3, const AlphaNum &x4) {
1456   if (x1.size() > 0) {
1457     memcpy(out, x1.data(), x1.size());
1458     out += x1.size();
1459   }
1460   if (x2.size() > 0) {
1461     memcpy(out, x2.data(), x2.size());
1462     out += x2.size();
1463   }
1464   if (x3.size() > 0) {
1465     memcpy(out, x3.data(), x3.size());
1466     out += x3.size();
1467   }
1468   if (x4.size() > 0) {
1469     memcpy(out, x4.data(), x4.size());
1470     out += x4.size();
1471   }
1472   return out;
1473 }
1474 
StrCat(const AlphaNum & a,const AlphaNum & b)1475 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1476   string result;
1477   result.resize(a.size() + b.size());
1478   char *const begin = &*result.begin();
1479   char *out = Append2(begin, a, b);
1480   GOOGLE_DCHECK_EQ(out, begin + result.size());
1481   return result;
1482 }
1483 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1484 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1485   string result;
1486   result.resize(a.size() + b.size() + c.size());
1487   char *const begin = &*result.begin();
1488   char *out = Append2(begin, a, b);
1489   out = Append1(out, c);
1490   GOOGLE_DCHECK_EQ(out, begin + result.size());
1491   return result;
1492 }
1493 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1494 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1495               const AlphaNum &d) {
1496   string result;
1497   result.resize(a.size() + b.size() + c.size() + d.size());
1498   char *const begin = &*result.begin();
1499   char *out = Append4(begin, a, b, c, d);
1500   GOOGLE_DCHECK_EQ(out, begin + result.size());
1501   return result;
1502 }
1503 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1504 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1505               const AlphaNum &d, const AlphaNum &e) {
1506   string result;
1507   result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1508   char *const begin = &*result.begin();
1509   char *out = Append4(begin, a, b, c, d);
1510   out = Append1(out, e);
1511   GOOGLE_DCHECK_EQ(out, begin + result.size());
1512   return result;
1513 }
1514 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1515 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1516               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1517   string result;
1518   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1519                 f.size());
1520   char *const begin = &*result.begin();
1521   char *out = Append4(begin, a, b, c, d);
1522   out = Append2(out, e, f);
1523   GOOGLE_DCHECK_EQ(out, begin + result.size());
1524   return result;
1525 }
1526 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1527 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1528               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1529               const AlphaNum &g) {
1530   string result;
1531   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1532                 f.size() + g.size());
1533   char *const begin = &*result.begin();
1534   char *out = Append4(begin, a, b, c, d);
1535   out = Append2(out, e, f);
1536   out = Append1(out, g);
1537   GOOGLE_DCHECK_EQ(out, begin + result.size());
1538   return result;
1539 }
1540 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1541 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1542               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1543               const AlphaNum &g, const AlphaNum &h) {
1544   string result;
1545   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1546                 f.size() + g.size() + h.size());
1547   char *const begin = &*result.begin();
1548   char *out = Append4(begin, a, b, c, d);
1549   out = Append4(out, e, f, g, h);
1550   GOOGLE_DCHECK_EQ(out, begin + result.size());
1551   return result;
1552 }
1553 
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1554 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1555               const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1556               const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1557   string result;
1558   result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1559                 f.size() + g.size() + h.size() + i.size());
1560   char *const begin = &*result.begin();
1561   char *out = Append4(begin, a, b, c, d);
1562   out = Append4(out, e, f, g, h);
1563   out = Append1(out, i);
1564   GOOGLE_DCHECK_EQ(out, begin + result.size());
1565   return result;
1566 }
1567 
1568 // It's possible to call StrAppend with a char * pointer that is partway into
1569 // the string we're appending to.  However the results of this are random.
1570 // Therefore, check for this in debug mode.  Use unsigned math so we only have
1571 // to do one comparison.
1572 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1573     GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1574                      uintptr_t((dest).size()))
1575 
StrAppend(string * result,const AlphaNum & a)1576 void StrAppend(string *result, const AlphaNum &a) {
1577   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1578   result->append(a.data(), a.size());
1579 }
1580 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1581 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1582   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1583   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1584   string::size_type old_size = result->size();
1585   result->resize(old_size + a.size() + b.size());
1586   char *const begin = &*result->begin();
1587   char *out = Append2(begin + old_size, a, b);
1588   GOOGLE_DCHECK_EQ(out, begin + result->size());
1589 }
1590 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1591 void StrAppend(string *result,
1592                const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1593   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1594   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1595   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1596   string::size_type old_size = result->size();
1597   result->resize(old_size + a.size() + b.size() + c.size());
1598   char *const begin = &*result->begin();
1599   char *out = Append2(begin + old_size, a, b);
1600   out = Append1(out, c);
1601   GOOGLE_DCHECK_EQ(out, begin + result->size());
1602 }
1603 
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1604 void StrAppend(string *result,
1605                const AlphaNum &a, const AlphaNum &b,
1606                const AlphaNum &c, const AlphaNum &d) {
1607   GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1608   GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1609   GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1610   GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1611   string::size_type old_size = result->size();
1612   result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1613   char *const begin = &*result->begin();
1614   char *out = Append4(begin + old_size, a, b, c, d);
1615   GOOGLE_DCHECK_EQ(out, begin + result->size());
1616 }
1617 
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1618 int GlobalReplaceSubstring(const string& substring,
1619                            const string& replacement,
1620                            string* s) {
1621   GOOGLE_CHECK(s != nullptr);
1622   if (s->empty() || substring.empty())
1623     return 0;
1624   string tmp;
1625   int num_replacements = 0;
1626   int pos = 0;
1627   for (int match_pos = s->find(substring.data(), pos, substring.length());
1628        match_pos != string::npos;
1629        pos = match_pos + substring.length(),
1630            match_pos = s->find(substring.data(), pos, substring.length())) {
1631     ++num_replacements;
1632     // Append the original content before the match.
1633     tmp.append(*s, pos, match_pos - pos);
1634     // Append the replacement for the match.
1635     tmp.append(replacement.begin(), replacement.end());
1636   }
1637   // Append the content after the last match. If no replacements were made, the
1638   // original string is left untouched.
1639   if (num_replacements > 0) {
1640     tmp.append(*s, pos, s->length() - pos);
1641     s->swap(tmp);
1642   }
1643   return num_replacements;
1644 }
1645 
CalculateBase64EscapedLen(int input_len,bool do_padding)1646 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1647   // Base64 encodes three bytes of input at a time. If the input is not
1648   // divisible by three, we pad as appropriate.
1649   //
1650   // (from http://tools.ietf.org/html/rfc3548)
1651   // Special processing is performed if fewer than 24 bits are available
1652   // at the end of the data being encoded.  A full encoding quantum is
1653   // always completed at the end of a quantity.  When fewer than 24 input
1654   // bits are available in an input group, zero bits are added (on the
1655   // right) to form an integral number of 6-bit groups.  Padding at the
1656   // end of the data is performed using the '=' character.  Since all base
1657   // 64 input is an integral number of octets, only the following cases
1658   // can arise:
1659 
1660 
1661   // Base64 encodes each three bytes of input into four bytes of output.
1662   int len = (input_len / 3) * 4;
1663 
1664   if (input_len % 3 == 0) {
1665     // (from http://tools.ietf.org/html/rfc3548)
1666     // (1) the final quantum of encoding input is an integral multiple of 24
1667     // bits; here, the final unit of encoded output will be an integral
1668     // multiple of 4 characters with no "=" padding,
1669   } else if (input_len % 3 == 1) {
1670     // (from http://tools.ietf.org/html/rfc3548)
1671     // (2) the final quantum of encoding input is exactly 8 bits; here, the
1672     // final unit of encoded output will be two characters followed by two
1673     // "=" padding characters, or
1674     len += 2;
1675     if (do_padding) {
1676       len += 2;
1677     }
1678   } else {  // (input_len % 3 == 2)
1679     // (from http://tools.ietf.org/html/rfc3548)
1680     // (3) the final quantum of encoding input is exactly 16 bits; here, the
1681     // final unit of encoded output will be three characters followed by one
1682     // "=" padding character.
1683     len += 3;
1684     if (do_padding) {
1685       len += 1;
1686     }
1687   }
1688 
1689   assert(len >= input_len);  // make sure we didn't overflow
1690   return len;
1691 }
1692 
1693 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1694 int CalculateBase64EscapedLen(int input_len) {
1695   return CalculateBase64EscapedLen(input_len, true);
1696 }
1697 
1698 // ----------------------------------------------------------------------
1699 // int Base64Unescape() - base64 decoder
1700 // int Base64Escape() - base64 encoder
1701 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1702 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1703 //
1704 // Check out
1705 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1706 // care about is that...
1707 //   Take the encoded stuff in groups of 4 characters and turn each
1708 //   character into a code 0 to 63 thus:
1709 //           A-Z map to 0 to 25
1710 //           a-z map to 26 to 51
1711 //           0-9 map to 52 to 61
1712 //           +(- for WebSafe) maps to 62
1713 //           /(_ for WebSafe) maps to 63
1714 //   There will be four numbers, all less than 64 which can be represented
1715 //   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1716 //   Arrange the 6 digit binary numbers into three bytes as such:
1717 //   aaaaaabb bbbbcccc ccdddddd
1718 //   Equals signs (one or two) are used at the end of the encoded block to
1719 //   indicate that the text was not an integer multiple of three bytes long.
1720 // ----------------------------------------------------------------------
1721 
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1722 int Base64UnescapeInternal(const char *src_param, int szsrc,
1723                            char *dest, int szdest,
1724                            const signed char* unbase64) {
1725   static const char kPad64Equals = '=';
1726   static const char kPad64Dot = '.';
1727 
1728   int decode = 0;
1729   int destidx = 0;
1730   int state = 0;
1731   unsigned int ch = 0;
1732   unsigned int temp = 0;
1733 
1734   // If "char" is signed by default, using *src as an array index results in
1735   // accessing negative array elements. Treat the input as a pointer to
1736   // unsigned char to avoid this.
1737   const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1738 
1739   // The GET_INPUT macro gets the next input character, skipping
1740   // over any whitespace, and stopping when we reach the end of the
1741   // string or when we read any non-data character.  The arguments are
1742   // an arbitrary identifier (used as a label for goto) and the number
1743   // of data bytes that must remain in the input to avoid aborting the
1744   // loop.
1745 #define GET_INPUT(label, remain)                 \
1746   label:                                         \
1747     --szsrc;                                     \
1748     ch = *src++;                                 \
1749     decode = unbase64[ch];                       \
1750     if (decode < 0) {                            \
1751       if (ascii_isspace(ch) && szsrc >= remain)  \
1752         goto label;                              \
1753       state = 4 - remain;                        \
1754       break;                                     \
1755     }
1756 
1757   // if dest is null, we're just checking to see if it's legal input
1758   // rather than producing output.  (I suspect this could just be done
1759   // with a regexp...).  We duplicate the loop so this test can be
1760   // outside it instead of in every iteration.
1761 
1762   if (dest) {
1763     // This loop consumes 4 input bytes and produces 3 output bytes
1764     // per iteration.  We can't know at the start that there is enough
1765     // data left in the string for a full iteration, so the loop may
1766     // break out in the middle; if so 'state' will be set to the
1767     // number of input bytes read.
1768 
1769     while (szsrc >= 4)  {
1770       // We'll start by optimistically assuming that the next four
1771       // bytes of the string (src[0..3]) are four good data bytes
1772       // (that is, no nulls, whitespace, padding chars, or illegal
1773       // chars).  We need to test src[0..2] for nulls individually
1774       // before constructing temp to preserve the property that we
1775       // never read past a null in the string (no matter how long
1776       // szsrc claims the string is).
1777 
1778       if (!src[0] || !src[1] || !src[2] ||
1779           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1780                    (unsigned(unbase64[src[1]]) << 12) |
1781                    (unsigned(unbase64[src[2]]) << 6) |
1782                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1783         // Iff any of those four characters was bad (null, illegal,
1784         // whitespace, padding), then temp's high bit will be set
1785         // (because unbase64[] is -1 for all bad characters).
1786         //
1787         // We'll back up and resort to the slower decoder, which knows
1788         // how to handle those cases.
1789 
1790         GET_INPUT(first, 4);
1791         temp = decode;
1792         GET_INPUT(second, 3);
1793         temp = (temp << 6) | decode;
1794         GET_INPUT(third, 2);
1795         temp = (temp << 6) | decode;
1796         GET_INPUT(fourth, 1);
1797         temp = (temp << 6) | decode;
1798       } else {
1799         // We really did have four good data bytes, so advance four
1800         // characters in the string.
1801 
1802         szsrc -= 4;
1803         src += 4;
1804         decode = -1;
1805         ch = '\0';
1806       }
1807 
1808       // temp has 24 bits of input, so write that out as three bytes.
1809 
1810       if (destidx+3 > szdest) return -1;
1811       dest[destidx+2] = temp;
1812       temp >>= 8;
1813       dest[destidx+1] = temp;
1814       temp >>= 8;
1815       dest[destidx] = temp;
1816       destidx += 3;
1817     }
1818   } else {
1819     while (szsrc >= 4)  {
1820       if (!src[0] || !src[1] || !src[2] ||
1821           (temp = ((unsigned(unbase64[src[0]]) << 18) |
1822                    (unsigned(unbase64[src[1]]) << 12) |
1823                    (unsigned(unbase64[src[2]]) << 6) |
1824                    (unsigned(unbase64[src[3]])))) & 0x80000000) {
1825         GET_INPUT(first_no_dest, 4);
1826         GET_INPUT(second_no_dest, 3);
1827         GET_INPUT(third_no_dest, 2);
1828         GET_INPUT(fourth_no_dest, 1);
1829       } else {
1830         szsrc -= 4;
1831         src += 4;
1832         decode = -1;
1833         ch = '\0';
1834       }
1835       destidx += 3;
1836     }
1837   }
1838 
1839 #undef GET_INPUT
1840 
1841   // if the loop terminated because we read a bad character, return
1842   // now.
1843   if (decode < 0 && ch != '\0' &&
1844       ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1845     return -1;
1846 
1847   if (ch == kPad64Equals || ch == kPad64Dot) {
1848     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1849     // look at it again when we count to check for the proper number of
1850     // equals signs at the end.
1851     ++szsrc;
1852     --src;
1853   } else {
1854     // This loop consumes 1 input byte per iteration.  It's used to
1855     // clean up the 0-3 input bytes remaining when the first, faster
1856     // loop finishes.  'temp' contains the data from 'state' input
1857     // characters read by the first loop.
1858     while (szsrc > 0)  {
1859       --szsrc;
1860       ch = *src++;
1861       decode = unbase64[ch];
1862       if (decode < 0) {
1863         if (ascii_isspace(ch)) {
1864           continue;
1865         } else if (ch == '\0') {
1866           break;
1867         } else if (ch == kPad64Equals || ch == kPad64Dot) {
1868           // back up one character; we'll read it again when we check
1869           // for the correct number of pad characters at the end.
1870           ++szsrc;
1871           --src;
1872           break;
1873         } else {
1874           return -1;
1875         }
1876       }
1877 
1878       // Each input character gives us six bits of output.
1879       temp = (temp << 6) | decode;
1880       ++state;
1881       if (state == 4) {
1882         // If we've accumulated 24 bits of output, write that out as
1883         // three bytes.
1884         if (dest) {
1885           if (destidx+3 > szdest) return -1;
1886           dest[destidx+2] = temp;
1887           temp >>= 8;
1888           dest[destidx+1] = temp;
1889           temp >>= 8;
1890           dest[destidx] = temp;
1891         }
1892         destidx += 3;
1893         state = 0;
1894         temp = 0;
1895       }
1896     }
1897   }
1898 
1899   // Process the leftover data contained in 'temp' at the end of the input.
1900   int expected_equals = 0;
1901   switch (state) {
1902     case 0:
1903       // Nothing left over; output is a multiple of 3 bytes.
1904       break;
1905 
1906     case 1:
1907       // Bad input; we have 6 bits left over.
1908       return -1;
1909 
1910     case 2:
1911       // Produce one more output byte from the 12 input bits we have left.
1912       if (dest) {
1913         if (destidx+1 > szdest) return -1;
1914         temp >>= 4;
1915         dest[destidx] = temp;
1916       }
1917       ++destidx;
1918       expected_equals = 2;
1919       break;
1920 
1921     case 3:
1922       // Produce two more output bytes from the 18 input bits we have left.
1923       if (dest) {
1924         if (destidx+2 > szdest) return -1;
1925         temp >>= 2;
1926         dest[destidx+1] = temp;
1927         temp >>= 8;
1928         dest[destidx] = temp;
1929       }
1930       destidx += 2;
1931       expected_equals = 1;
1932       break;
1933 
1934     default:
1935       // state should have no other values at this point.
1936       GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1937   }
1938 
1939   // The remainder of the string should be all whitespace, mixed with
1940   // exactly 0 equals signs, or exactly 'expected_equals' equals
1941   // signs.  (Always accepting 0 equals signs is a google extension
1942   // not covered in the RFC, as is accepting dot as the pad character.)
1943 
1944   int equals = 0;
1945   while (szsrc > 0 && *src) {
1946     if (*src == kPad64Equals || *src == kPad64Dot)
1947       ++equals;
1948     else if (!ascii_isspace(*src))
1949       return -1;
1950     --szsrc;
1951     ++src;
1952   }
1953 
1954   return (equals == 0 || equals == expected_equals) ? destidx : -1;
1955 }
1956 
1957 // The arrays below were generated by the following code
1958 // #include <sys/time.h>
1959 // #include <stdlib.h>
1960 // #include <string.h>
1961 // #include <stdio.h>
1962 // main()
1963 // {
1964 //   static const char Base64[] =
1965 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1966 //   const char *pos;
1967 //   int idx, i, j;
1968 //   printf("    ");
1969 //   for (i = 0; i < 255; i += 8) {
1970 //     for (j = i; j < i + 8; j++) {
1971 //       pos = strchr(Base64, j);
1972 //       if ((pos == nullptr) || (j == 0))
1973 //         idx = -1;
1974 //       else
1975 //         idx = pos - Base64;
1976 //       if (idx == -1)
1977 //         printf(" %2d,     ", idx);
1978 //       else
1979 //         printf(" %2d/""*%c*""/,", idx, j);
1980 //     }
1981 //     printf("\n    ");
1982 //   }
1983 // }
1984 //
1985 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1986 // tables from the functions below.
1987 static const signed char kUnBase64[] = {
1988   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1989   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1990   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1991   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1992   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
1993   -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
1994   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
1995   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
1996   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
1997    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
1998   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
1999   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
2000   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2001   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2002   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2003   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2004   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2005   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2006   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2007   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2008   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2009   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2010   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2011   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2012   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2013   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2014   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2015   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2016   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2017   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2018   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2019   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2020 };
2021 static const signed char kUnWebSafeBase64[] = {
2022   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2023   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2024   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2025   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2026   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2027   -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
2028   52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2029   60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
2030   -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
2031    7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2032   15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2033   23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
2034   -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2035   33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2036   41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2037   49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
2038   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2039   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2040   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2041   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2042   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2043   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2044   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2045   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2046   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2047   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2048   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2049   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2050   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2051   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2052   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
2053   -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
2054 };
2055 
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2056 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2057   return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2058 }
2059 
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2060 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2061                                    const signed char* unbase64) {
2062   // Determine the size of the output string.  Base64 encodes every 3 bytes into
2063   // 4 characters.  any leftover chars are added directly for good measure.
2064   // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2065   const int dest_len = 3 * (slen / 4) + (slen % 4);
2066 
2067   dest->resize(dest_len);
2068 
2069   // We are getting the destination buffer by getting the beginning of the
2070   // string and converting it into a char *.
2071   const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2072                                          dest_len, unbase64);
2073   if (len < 0) {
2074     dest->clear();
2075     return false;
2076   }
2077 
2078   // could be shorter if there was padding
2079   GOOGLE_DCHECK_LE(len, dest_len);
2080   dest->erase(len);
2081 
2082   return true;
2083 }
2084 
Base64Unescape(StringPiece src,string * dest)2085 bool Base64Unescape(StringPiece src, string* dest) {
2086   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2087 }
2088 
WebSafeBase64Unescape(StringPiece src,string * dest)2089 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2090   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2091 }
2092 
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2093 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2094                          char *dest, int szdest, const char *base64,
2095                          bool do_padding) {
2096   static const char kPad64 = '=';
2097 
2098   if (szsrc <= 0) return 0;
2099 
2100   if (szsrc * 4 > szdest * 3) return 0;
2101 
2102   char *cur_dest = dest;
2103   const unsigned char *cur_src = src;
2104 
2105   char *limit_dest = dest + szdest;
2106   const unsigned char *limit_src = src + szsrc;
2107 
2108   // Three bytes of data encodes to four characters of cyphertext.
2109   // So we can pump through three-byte chunks atomically.
2110   while (cur_src < limit_src - 3) {  // keep going as long as we have >= 32 bits
2111     uint32 in = BigEndian::Load32(cur_src) >> 8;
2112 
2113     cur_dest[0] = base64[in >> 18];
2114     in &= 0x3FFFF;
2115     cur_dest[1] = base64[in >> 12];
2116     in &= 0xFFF;
2117     cur_dest[2] = base64[in >> 6];
2118     in &= 0x3F;
2119     cur_dest[3] = base64[in];
2120 
2121     cur_dest += 4;
2122     cur_src += 3;
2123   }
2124   // To save time, we didn't update szdest or szsrc in the loop.  So do it now.
2125   szdest = limit_dest - cur_dest;
2126   szsrc = limit_src - cur_src;
2127 
2128   /* now deal with the tail (<=3 bytes) */
2129   switch (szsrc) {
2130     case 0:
2131       // Nothing left; nothing more to do.
2132       break;
2133     case 1: {
2134       // One byte left: this encodes to two characters, and (optionally)
2135       // two pad characters to round out the four-character cypherblock.
2136       if ((szdest -= 2) < 0) return 0;
2137       uint32 in = cur_src[0];
2138       cur_dest[0] = base64[in >> 2];
2139       in &= 0x3;
2140       cur_dest[1] = base64[in << 4];
2141       cur_dest += 2;
2142       if (do_padding) {
2143         if ((szdest -= 2) < 0) return 0;
2144         cur_dest[0] = kPad64;
2145         cur_dest[1] = kPad64;
2146         cur_dest += 2;
2147       }
2148       break;
2149     }
2150     case 2: {
2151       // Two bytes left: this encodes to three characters, and (optionally)
2152       // one pad character to round out the four-character cypherblock.
2153       if ((szdest -= 3) < 0) return 0;
2154       uint32 in = BigEndian::Load16(cur_src);
2155       cur_dest[0] = base64[in >> 10];
2156       in &= 0x3FF;
2157       cur_dest[1] = base64[in >> 4];
2158       in &= 0x00F;
2159       cur_dest[2] = base64[in << 2];
2160       cur_dest += 3;
2161       if (do_padding) {
2162         if ((szdest -= 1) < 0) return 0;
2163         cur_dest[0] = kPad64;
2164         cur_dest += 1;
2165       }
2166       break;
2167     }
2168     case 3: {
2169       // Three bytes left: same as in the big loop above.  We can't do this in
2170       // the loop because the loop above always reads 4 bytes, and the fourth
2171       // byte is past the end of the input.
2172       if ((szdest -= 4) < 0) return 0;
2173       uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2174       cur_dest[0] = base64[in >> 18];
2175       in &= 0x3FFFF;
2176       cur_dest[1] = base64[in >> 12];
2177       in &= 0xFFF;
2178       cur_dest[2] = base64[in >> 6];
2179       in &= 0x3F;
2180       cur_dest[3] = base64[in];
2181       cur_dest += 4;
2182       break;
2183     }
2184     default:
2185       // Should not be reached: blocks of 4 bytes are handled
2186       // in the while loop before this switch statement.
2187       GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2188       break;
2189   }
2190   return (cur_dest - dest);
2191 }
2192 
2193 static const char kBase64Chars[] =
2194 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2195 
2196 static const char kWebSafeBase64Chars[] =
2197 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2198 
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2199 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2200   return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2201 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2202 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2203                         int szdest, bool do_padding) {
2204   return Base64EscapeInternal(src, szsrc, dest, szdest,
2205                               kWebSafeBase64Chars, do_padding);
2206 }
2207 
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2208 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2209                           string* dest, bool do_padding,
2210                           const char* base64_chars) {
2211   const int calc_escaped_size =
2212     CalculateBase64EscapedLen(szsrc, do_padding);
2213   dest->resize(calc_escaped_size);
2214   const int escaped_len = Base64EscapeInternal(src, szsrc,
2215                                                string_as_array(dest),
2216                                                dest->size(),
2217                                                base64_chars,
2218                                                do_padding);
2219   GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2220   dest->erase(escaped_len);
2221 }
2222 
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2223 void Base64Escape(const unsigned char *src, int szsrc,
2224                   string* dest, bool do_padding) {
2225   Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2226 }
2227 
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2229                          string *dest, bool do_padding) {
2230   Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2231 }
2232 
Base64Escape(StringPiece src,string * dest)2233 void Base64Escape(StringPiece src, string* dest) {
2234   Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2235                src.size(), dest, true);
2236 }
2237 
WebSafeBase64Escape(StringPiece src,string * dest)2238 void WebSafeBase64Escape(StringPiece src, string* dest) {
2239   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240                       src.size(), dest, false);
2241 }
2242 
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2243 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2244   WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245                       src.size(), dest, true);
2246 }
2247 
2248 // Helper to append a Unicode code point to a string as UTF8, without bringing
2249 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2250 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2251   uint32 tmp = 0;
2252   int len = 0;
2253   if (code_point <= 0x7f) {
2254     tmp = code_point;
2255     len = 1;
2256   } else if (code_point <= 0x07ff) {
2257     tmp = 0x0000c080 |
2258         ((code_point & 0x07c0) << 2) |
2259         (code_point & 0x003f);
2260     len = 2;
2261   } else if (code_point <= 0xffff) {
2262     tmp = 0x00e08080 |
2263         ((code_point & 0xf000) << 4) |
2264         ((code_point & 0x0fc0) << 2) |
2265         (code_point & 0x003f);
2266     len = 3;
2267   } else {
2268     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2269     // normally only defined up to there as well.
2270     tmp = 0xf0808080 |
2271         ((code_point & 0x1c0000) << 6) |
2272         ((code_point & 0x03f000) << 4) |
2273         ((code_point & 0x000fc0) << 2) |
2274         (code_point & 0x003f);
2275     len = 4;
2276   }
2277   tmp = ghtonl(tmp);
2278   memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2279   return len;
2280 }
2281 
2282 // Table of UTF-8 character lengths, based on first byte
2283 static const unsigned char kUTF8LenTbl[256] = {
2284     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2285     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2286     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2287     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2288     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2289     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2290 
2291     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2292     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2293     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2294     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2295     2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2296     3, 3, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
2297 
2298 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2299 int UTF8FirstLetterNumBytes(const char* src, int len) {
2300   if (len == 0) {
2301     return 0;
2302   }
2303   return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2304 }
2305 
2306 // ----------------------------------------------------------------------
2307 // CleanStringLineEndings()
2308 //   Clean up a multi-line string to conform to Unix line endings.
2309 //   Reads from src and appends to dst, so usually dst should be empty.
2310 //
2311 //   If there is no line ending at the end of a non-empty string, it can
2312 //   be added automatically.
2313 //
2314 //   Four different types of input are correctly handled:
2315 //
2316 //     - Unix/Linux files: line ending is LF: pass through unchanged
2317 //
2318 //     - DOS/Windows files: line ending is CRLF: convert to LF
2319 //
2320 //     - Legacy Mac files: line ending is CR: convert to LF
2321 //
2322 //     - Garbled files: random line endings: convert gracefully
2323 //                      lonely CR, lonely LF, CRLF: convert to LF
2324 //
2325 //   @param src The multi-line string to convert
2326 //   @param dst The converted string is appended to this string
2327 //   @param auto_end_last_line Automatically terminate the last line
2328 //
2329 //   Limitations:
2330 //
2331 //     This does not do the right thing for CRCRLF files created by
2332 //     broken programs that do another Unix->DOS conversion on files
2333 //     that are already in CRLF format.  For this, a two-pass approach
2334 //     brute-force would be needed that
2335 //
2336 //       (1) determines the presence of LF (first one is ok)
2337 //       (2) if yes, removes any CR, else convert every CR to LF
2338 
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2339 void CleanStringLineEndings(const string &src, string *dst,
2340                             bool auto_end_last_line) {
2341   if (dst->empty()) {
2342     dst->append(src);
2343     CleanStringLineEndings(dst, auto_end_last_line);
2344   } else {
2345     string tmp = src;
2346     CleanStringLineEndings(&tmp, auto_end_last_line);
2347     dst->append(tmp);
2348   }
2349 }
2350 
CleanStringLineEndings(string * str,bool auto_end_last_line)2351 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2352   ptrdiff_t output_pos = 0;
2353   bool r_seen = false;
2354   ptrdiff_t len = str->size();
2355 
2356   char *p = &(*str)[0];
2357 
2358   for (ptrdiff_t input_pos = 0; input_pos < len;) {
2359     if (!r_seen && input_pos + 8 < len) {
2360       uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2361       // Loop over groups of 8 bytes at a time until we come across
2362       // a word that has a byte whose value is less than or equal to
2363       // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2364       //
2365       // We use a has_less macro that quickly tests a whole 64-bit
2366       // word to see if any of the bytes has a value < N.
2367       //
2368       // For more details, see:
2369       //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2370 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2371       if (!has_less(v, '\r' + 1)) {
2372 #undef has_less
2373         // No byte in this word has a value that could be a \r or a \n
2374         if (output_pos != input_pos) {
2375           GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2376         }
2377         input_pos += 8;
2378         output_pos += 8;
2379         continue;
2380       }
2381     }
2382     string::const_reference in = p[input_pos];
2383     if (in == '\r') {
2384       if (r_seen) p[output_pos++] = '\n';
2385       r_seen = true;
2386     } else if (in == '\n') {
2387       if (input_pos != output_pos)
2388         p[output_pos++] = '\n';
2389       else
2390         output_pos++;
2391       r_seen = false;
2392     } else {
2393       if (r_seen) p[output_pos++] = '\n';
2394       r_seen = false;
2395       if (input_pos != output_pos)
2396         p[output_pos++] = in;
2397       else
2398         output_pos++;
2399     }
2400     input_pos++;
2401   }
2402   if (r_seen ||
2403       (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2404     str->resize(output_pos + 1);
2405     str->operator[](output_pos) = '\n';
2406   } else if (output_pos < len) {
2407     str->resize(output_pos);
2408   }
2409 }
2410 
2411 namespace internal {
2412 
2413 // ----------------------------------------------------------------------
2414 // NoLocaleStrtod()
2415 //   This code will make you cry.
2416 // ----------------------------------------------------------------------
2417 
2418 namespace {
2419 
2420 // Returns a string identical to *input except that the character pointed to
2421 // by radix_pos (which should be '.') is replaced with the locale-specific
2422 // radix character.
LocalizeRadix(const char * input,const char * radix_pos)2423 std::string LocalizeRadix(const char *input, const char *radix_pos) {
2424   // Determine the locale-specific radix character by calling sprintf() to
2425   // print the number 1.5, then stripping off the digits.  As far as I can
2426   // tell, this is the only portable, thread-safe way to get the C library
2427   // to divuldge the locale's radix character.  No, localeconv() is NOT
2428   // thread-safe.
2429   char temp[16];
2430   int size = snprintf(temp, sizeof(temp), "%.1f", 1.5);
2431   GOOGLE_CHECK_EQ(temp[0], '1');
2432   GOOGLE_CHECK_EQ(temp[size - 1], '5');
2433   GOOGLE_CHECK_LE(size, 6);
2434 
2435   // Now replace the '.' in the input with it.
2436   std::string result;
2437   result.reserve(strlen(input) + size - 3);
2438   result.append(input, radix_pos);
2439   result.append(temp + 1, size - 2);
2440   result.append(radix_pos + 1);
2441   return result;
2442 }
2443 
2444 }  // namespace
2445 
NoLocaleStrtod(const char * str,char ** endptr)2446 double NoLocaleStrtod(const char *str, char **endptr) {
2447   // We cannot simply set the locale to "C" temporarily with setlocale()
2448   // as this is not thread-safe.  Instead, we try to parse in the current
2449   // locale first.  If parsing stops at a '.' character, then this is a
2450   // pretty good hint that we're actually in some other locale in which
2451   // '.' is not the radix character.
2452 
2453   char *temp_endptr;
2454   double result = strtod(str, &temp_endptr);
2455   if (endptr != NULL) *endptr = temp_endptr;
2456   if (*temp_endptr != '.') return result;
2457 
2458   // Parsing halted on a '.'.  Perhaps we're in a different locale?  Let's
2459   // try to replace the '.' with a locale-specific radix character and
2460   // try again.
2461   std::string localized = LocalizeRadix(str, temp_endptr);
2462   const char *localized_cstr = localized.c_str();
2463   char *localized_endptr;
2464   result = strtod(localized_cstr, &localized_endptr);
2465   if ((localized_endptr - localized_cstr) > (temp_endptr - str)) {
2466     // This attempt got further, so replacing the decimal must have helped.
2467     // Update endptr to point at the right location.
2468     if (endptr != NULL) {
2469       // size_diff is non-zero if the localized radix has multiple bytes.
2470       int size_diff = localized.size() - strlen(str);
2471       // const_cast is necessary to match the strtod() interface.
2472       *endptr = const_cast<char *>(
2473           str + (localized_endptr - localized_cstr - size_diff));
2474     }
2475   }
2476 
2477   return result;
2478 }
2479 
2480 }  // namespace internal
2481 
2482 }  // namespace protobuf
2483 }  // namespace google
2484