1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // from google3/strings/strutil.cc
32
33 #include <google/protobuf/stubs/strutil.h>
34
35 #include <errno.h>
36 #include <float.h> // FLT_DIG and DBL_DIG
37 #include <limits.h>
38 #include <stdio.h>
39 #include <cmath>
40 #include <iterator>
41 #include <limits>
42
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/stl_util.h>
45 #include <google/protobuf/io/strtod.h>
46
47 #ifdef _WIN32
48 // MSVC has only _snprintf, not snprintf.
49 //
50 // MinGW has both snprintf and _snprintf, but they appear to be different
51 // functions. The former is buggy. When invoked like so:
52 // char buffer[32];
53 // snprintf(buffer, 32, "%.*g\n", FLT_DIG, 1.23e10f);
54 // it prints "1.23000e+10". This is plainly wrong: %g should never print
55 // trailing zeros after the decimal point. For some reason this bug only
56 // occurs with some input values, not all. In any case, _snprintf does the
57 // right thing, so we use it.
58 #define snprintf _snprintf
59 #endif
60
61 namespace google {
62 namespace protobuf {
63
64 // These are defined as macros on some platforms. #undef them so that we can
65 // redefine them.
66 #undef isxdigit
67 #undef isprint
68
69 // The definitions of these in ctype.h change based on locale. Since our
70 // string manipulation is all in relation to the protocol buffer and C++
71 // languages, we always want to use the C locale. So, we re-define these
72 // exactly as we want them.
isxdigit(char c)73 inline bool isxdigit(char c) {
74 return ('0' <= c && c <= '9') ||
75 ('a' <= c && c <= 'f') ||
76 ('A' <= c && c <= 'F');
77 }
78
isprint(char c)79 inline bool isprint(char c) {
80 return c >= 0x20 && c <= 0x7E;
81 }
82
83 // ----------------------------------------------------------------------
84 // StripString
85 // Replaces any occurrence of the character 'remove' (or the characters
86 // in 'remove') with the character 'replacewith'.
87 // ----------------------------------------------------------------------
StripString(string * s,const char * remove,char replacewith)88 void StripString(string* s, const char* remove, char replacewith) {
89 const char * str_start = s->c_str();
90 const char * str = str_start;
91 for (str = strpbrk(str, remove);
92 str != nullptr;
93 str = strpbrk(str + 1, remove)) {
94 (*s)[str - str_start] = replacewith;
95 }
96 }
97
98 // ----------------------------------------------------------------------
99 // ReplaceCharacters
100 // Replaces any occurrence of the character 'remove' (or the characters
101 // in 'remove') with the character 'replacewith'.
102 // ----------------------------------------------------------------------
ReplaceCharacters(string * s,const char * remove,char replacewith)103 void ReplaceCharacters(string *s, const char *remove, char replacewith) {
104 const char *str_start = s->c_str();
105 const char *str = str_start;
106 for (str = strpbrk(str, remove);
107 str != nullptr;
108 str = strpbrk(str + 1, remove)) {
109 (*s)[str - str_start] = replacewith;
110 }
111 }
112
StripWhitespace(string * str)113 void StripWhitespace(string* str) {
114 int str_length = str->length();
115
116 // Strip off leading whitespace.
117 int first = 0;
118 while (first < str_length && ascii_isspace(str->at(first))) {
119 ++first;
120 }
121 // If entire string is white space.
122 if (first == str_length) {
123 str->clear();
124 return;
125 }
126 if (first > 0) {
127 str->erase(0, first);
128 str_length -= first;
129 }
130
131 // Strip off trailing whitespace.
132 int last = str_length - 1;
133 while (last >= 0 && ascii_isspace(str->at(last))) {
134 --last;
135 }
136 if (last != (str_length - 1) && last >= 0) {
137 str->erase(last + 1, string::npos);
138 }
139 }
140
141 // ----------------------------------------------------------------------
142 // StringReplace()
143 // Replace the "old" pattern with the "new" pattern in a string,
144 // and append the result to "res". If replace_all is false,
145 // it only replaces the first instance of "old."
146 // ----------------------------------------------------------------------
147
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all,string * res)148 void StringReplace(const string& s, const string& oldsub,
149 const string& newsub, bool replace_all,
150 string* res) {
151 if (oldsub.empty()) {
152 res->append(s); // if empty, append the given string.
153 return;
154 }
155
156 string::size_type start_pos = 0;
157 string::size_type pos;
158 do {
159 pos = s.find(oldsub, start_pos);
160 if (pos == string::npos) {
161 break;
162 }
163 res->append(s, start_pos, pos - start_pos);
164 res->append(newsub);
165 start_pos = pos + oldsub.size(); // start searching again after the "old"
166 } while (replace_all);
167 res->append(s, start_pos, s.length() - start_pos);
168 }
169
170 // ----------------------------------------------------------------------
171 // StringReplace()
172 // Give me a string and two patterns "old" and "new", and I replace
173 // the first instance of "old" in the string with "new", if it
174 // exists. If "global" is true; call this repeatedly until it
175 // fails. RETURN a new string, regardless of whether the replacement
176 // happened or not.
177 // ----------------------------------------------------------------------
178
StringReplace(const string & s,const string & oldsub,const string & newsub,bool replace_all)179 string StringReplace(const string& s, const string& oldsub,
180 const string& newsub, bool replace_all) {
181 string ret;
182 StringReplace(s, oldsub, newsub, replace_all, &ret);
183 return ret;
184 }
185
186 // ----------------------------------------------------------------------
187 // SplitStringUsing()
188 // Split a string using a character delimiter. Append the components
189 // to 'result'.
190 //
191 // Note: For multi-character delimiters, this routine will split on *ANY* of
192 // the characters in the string, not the entire string as a single delimiter.
193 // ----------------------------------------------------------------------
194 template <typename ITR>
195 static inline
SplitStringToIteratorUsing(const string & full,const char * delim,ITR & result)196 void SplitStringToIteratorUsing(const string& full,
197 const char* delim,
198 ITR& result) {
199 // Optimize the common case where delim is a single character.
200 if (delim[0] != '\0' && delim[1] == '\0') {
201 char c = delim[0];
202 const char* p = full.data();
203 const char* end = p + full.size();
204 while (p != end) {
205 if (*p == c) {
206 ++p;
207 } else {
208 const char* start = p;
209 while (++p != end && *p != c);
210 *result++ = string(start, p - start);
211 }
212 }
213 return;
214 }
215
216 string::size_type begin_index, end_index;
217 begin_index = full.find_first_not_of(delim);
218 while (begin_index != string::npos) {
219 end_index = full.find_first_of(delim, begin_index);
220 if (end_index == string::npos) {
221 *result++ = full.substr(begin_index);
222 return;
223 }
224 *result++ = full.substr(begin_index, (end_index - begin_index));
225 begin_index = full.find_first_not_of(delim, end_index);
226 }
227 }
228
SplitStringUsing(const string & full,const char * delim,std::vector<string> * result)229 void SplitStringUsing(const string& full,
230 const char* delim,
231 std::vector<string>* result) {
232 std::back_insert_iterator< std::vector<string> > it(*result);
233 SplitStringToIteratorUsing(full, delim, it);
234 }
235
236 // Split a string using a character delimiter. Append the components
237 // to 'result'. If there are consecutive delimiters, this function
238 // will return corresponding empty strings. The string is split into
239 // at most the specified number of pieces greedily. This means that the
240 // last piece may possibly be split further. To split into as many pieces
241 // as possible, specify 0 as the number of pieces.
242 //
243 // If "full" is the empty string, yields an empty string as the only value.
244 //
245 // If "pieces" is negative for some reason, it returns the whole string
246 // ----------------------------------------------------------------------
247 template <typename StringType, typename ITR>
248 static inline
SplitStringToIteratorAllowEmpty(const StringType & full,const char * delim,int pieces,ITR & result)249 void SplitStringToIteratorAllowEmpty(const StringType& full,
250 const char* delim,
251 int pieces,
252 ITR& result) {
253 string::size_type begin_index, end_index;
254 begin_index = 0;
255
256 for (int i = 0; (i < pieces-1) || (pieces == 0); i++) {
257 end_index = full.find_first_of(delim, begin_index);
258 if (end_index == string::npos) {
259 *result++ = full.substr(begin_index);
260 return;
261 }
262 *result++ = full.substr(begin_index, (end_index - begin_index));
263 begin_index = end_index + 1;
264 }
265 *result++ = full.substr(begin_index);
266 }
267
SplitStringAllowEmpty(const string & full,const char * delim,std::vector<string> * result)268 void SplitStringAllowEmpty(const string& full, const char* delim,
269 std::vector<string>* result) {
270 std::back_insert_iterator<std::vector<string> > it(*result);
271 SplitStringToIteratorAllowEmpty(full, delim, 0, it);
272 }
273
274 // ----------------------------------------------------------------------
275 // JoinStrings()
276 // This merges a vector of string components with delim inserted
277 // as separaters between components.
278 //
279 // ----------------------------------------------------------------------
280 template <class ITERATOR>
JoinStringsIterator(const ITERATOR & start,const ITERATOR & end,const char * delim,string * result)281 static void JoinStringsIterator(const ITERATOR& start,
282 const ITERATOR& end,
283 const char* delim,
284 string* result) {
285 GOOGLE_CHECK(result != nullptr);
286 result->clear();
287 int delim_length = strlen(delim);
288
289 // Precompute resulting length so we can reserve() memory in one shot.
290 int length = 0;
291 for (ITERATOR iter = start; iter != end; ++iter) {
292 if (iter != start) {
293 length += delim_length;
294 }
295 length += iter->size();
296 }
297 result->reserve(length);
298
299 // Now combine everything.
300 for (ITERATOR iter = start; iter != end; ++iter) {
301 if (iter != start) {
302 result->append(delim, delim_length);
303 }
304 result->append(iter->data(), iter->size());
305 }
306 }
307
JoinStrings(const std::vector<string> & components,const char * delim,string * result)308 void JoinStrings(const std::vector<string>& components,
309 const char* delim,
310 string * result) {
311 JoinStringsIterator(components.begin(), components.end(), delim, result);
312 }
313
314 // ----------------------------------------------------------------------
315 // UnescapeCEscapeSequences()
316 // This does all the unescaping that C does: \ooo, \r, \n, etc
317 // Returns length of resulting string.
318 // The implementation of \x parses any positive number of hex digits,
319 // but it is an error if the value requires more than 8 bits, and the
320 // result is truncated to 8 bits.
321 //
322 // The second call stores its errors in a supplied string vector.
323 // If the string vector pointer is nullptr, it reports the errors with LOG().
324 // ----------------------------------------------------------------------
325
326 #define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
327
328 // Protocol buffers doesn't ever care about errors, but I don't want to remove
329 // the code.
330 #define LOG_STRING(LEVEL, VECTOR) GOOGLE_LOG_IF(LEVEL, false)
331
UnescapeCEscapeSequences(const char * source,char * dest)332 int UnescapeCEscapeSequences(const char* source, char* dest) {
333 return UnescapeCEscapeSequences(source, dest, nullptr);
334 }
335
UnescapeCEscapeSequences(const char * source,char * dest,std::vector<string> * errors)336 int UnescapeCEscapeSequences(const char* source, char* dest,
337 std::vector<string> *errors) {
338 GOOGLE_DCHECK(errors == nullptr) << "Error reporting not implemented.";
339
340 char* d = dest;
341 const char* p = source;
342
343 // Small optimization for case where source = dest and there's no escaping
344 while ( p == d && *p != '\0' && *p != '\\' )
345 p++, d++;
346
347 while (*p != '\0') {
348 if (*p != '\\') {
349 *d++ = *p++;
350 } else {
351 switch ( *++p ) { // skip past the '\\'
352 case '\0':
353 LOG_STRING(ERROR, errors) << "String cannot end with \\";
354 *d = '\0';
355 return d - dest; // we're done with p
356 case 'a': *d++ = '\a'; break;
357 case 'b': *d++ = '\b'; break;
358 case 'f': *d++ = '\f'; break;
359 case 'n': *d++ = '\n'; break;
360 case 'r': *d++ = '\r'; break;
361 case 't': *d++ = '\t'; break;
362 case 'v': *d++ = '\v'; break;
363 case '\\': *d++ = '\\'; break;
364 case '?': *d++ = '\?'; break; // \? Who knew?
365 case '\'': *d++ = '\''; break;
366 case '"': *d++ = '\"'; break;
367 case '0': case '1': case '2': case '3': // octal digit: 1 to 3 digits
368 case '4': case '5': case '6': case '7': {
369 char ch = *p - '0';
370 if ( IS_OCTAL_DIGIT(p[1]) )
371 ch = ch * 8 + *++p - '0';
372 if ( IS_OCTAL_DIGIT(p[1]) ) // safe (and easy) to do this twice
373 ch = ch * 8 + *++p - '0'; // now points at last digit
374 *d++ = ch;
375 break;
376 }
377 case 'x': case 'X': {
378 if (!isxdigit(p[1])) {
379 if (p[1] == '\0') {
380 LOG_STRING(ERROR, errors) << "String cannot end with \\x";
381 } else {
382 LOG_STRING(ERROR, errors) <<
383 "\\x cannot be followed by non-hex digit: \\" << *p << p[1];
384 }
385 break;
386 }
387 unsigned int ch = 0;
388 const char *hex_start = p;
389 while (isxdigit(p[1])) // arbitrarily many hex digits
390 ch = (ch << 4) + hex_digit_to_int(*++p);
391 if (ch > 0xFF)
392 LOG_STRING(ERROR, errors) << "Value of " <<
393 "\\" << string(hex_start, p+1-hex_start) << " exceeds 8 bits";
394 *d++ = ch;
395 break;
396 }
397 #if 0 // TODO(kenton): Support \u and \U? Requires runetochar().
398 case 'u': {
399 // \uhhhh => convert 4 hex digits to UTF-8
400 char32 rune = 0;
401 const char *hex_start = p;
402 for (int i = 0; i < 4; ++i) {
403 if (isxdigit(p[1])) { // Look one char ahead.
404 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
405 } else {
406 LOG_STRING(ERROR, errors)
407 << "\\u must be followed by 4 hex digits: \\"
408 << string(hex_start, p+1-hex_start);
409 break;
410 }
411 }
412 d += runetochar(d, &rune);
413 break;
414 }
415 case 'U': {
416 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
417 char32 rune = 0;
418 const char *hex_start = p;
419 for (int i = 0; i < 8; ++i) {
420 if (isxdigit(p[1])) { // Look one char ahead.
421 // Don't change rune until we're sure this
422 // is within the Unicode limit, but do advance p.
423 char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
424 if (newrune > 0x10FFFF) {
425 LOG_STRING(ERROR, errors)
426 << "Value of \\"
427 << string(hex_start, p + 1 - hex_start)
428 << " exceeds Unicode limit (0x10FFFF)";
429 break;
430 } else {
431 rune = newrune;
432 }
433 } else {
434 LOG_STRING(ERROR, errors)
435 << "\\U must be followed by 8 hex digits: \\"
436 << string(hex_start, p+1-hex_start);
437 break;
438 }
439 }
440 d += runetochar(d, &rune);
441 break;
442 }
443 #endif
444 default:
445 LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
446 }
447 p++; // read past letter we escaped
448 }
449 }
450 *d = '\0';
451 return d - dest;
452 }
453
454 // ----------------------------------------------------------------------
455 // UnescapeCEscapeString()
456 // This does the same thing as UnescapeCEscapeSequences, but creates
457 // a new string. The caller does not need to worry about allocating
458 // a dest buffer. This should be used for non performance critical
459 // tasks such as printing debug messages. It is safe for src and dest
460 // to be the same.
461 //
462 // The second call stores its errors in a supplied string vector.
463 // If the string vector pointer is nullptr, it reports the errors with LOG().
464 //
465 // In the first and second calls, the length of dest is returned. In the
466 // the third call, the new string is returned.
467 // ----------------------------------------------------------------------
UnescapeCEscapeString(const string & src,string * dest)468 int UnescapeCEscapeString(const string& src, string* dest) {
469 return UnescapeCEscapeString(src, dest, nullptr);
470 }
471
UnescapeCEscapeString(const string & src,string * dest,std::vector<string> * errors)472 int UnescapeCEscapeString(const string& src, string* dest,
473 std::vector<string> *errors) {
474 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
475 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), errors);
476 GOOGLE_CHECK(dest);
477 dest->assign(unescaped.get(), len);
478 return len;
479 }
480
UnescapeCEscapeString(const string & src)481 string UnescapeCEscapeString(const string& src) {
482 std::unique_ptr<char[]> unescaped(new char[src.size() + 1]);
483 int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
484 return string(unescaped.get(), len);
485 }
486
487 // ----------------------------------------------------------------------
488 // CEscapeString()
489 // CHexEscapeString()
490 // Copies 'src' to 'dest', escaping dangerous characters using
491 // C-style escape sequences. This is very useful for preparing query
492 // flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
493 // hexadecimal rather than octal sequences.
494 // Returns the number of bytes written to 'dest' (not including the \0)
495 // or -1 if there was insufficient space.
496 //
497 // Currently only \n, \r, \t, ", ', \ and !isprint() chars are escaped.
498 // ----------------------------------------------------------------------
CEscapeInternal(const char * src,int src_len,char * dest,int dest_len,bool use_hex,bool utf8_safe)499 int CEscapeInternal(const char* src, int src_len, char* dest,
500 int dest_len, bool use_hex, bool utf8_safe) {
501 const char* src_end = src + src_len;
502 int used = 0;
503 bool last_hex_escape = false; // true if last output char was \xNN
504
505 for (; src < src_end; src++) {
506 if (dest_len - used < 2) // Need space for two letter escape
507 return -1;
508
509 bool is_hex_escape = false;
510 switch (*src) {
511 case '\n': dest[used++] = '\\'; dest[used++] = 'n'; break;
512 case '\r': dest[used++] = '\\'; dest[used++] = 'r'; break;
513 case '\t': dest[used++] = '\\'; dest[used++] = 't'; break;
514 case '\"': dest[used++] = '\\'; dest[used++] = '\"'; break;
515 case '\'': dest[used++] = '\\'; dest[used++] = '\''; break;
516 case '\\': dest[used++] = '\\'; dest[used++] = '\\'; break;
517 default:
518 // Note that if we emit \xNN and the src character after that is a hex
519 // digit then that digit must be escaped too to prevent it being
520 // interpreted as part of the character code by C.
521 if ((!utf8_safe || static_cast<uint8>(*src) < 0x80) &&
522 (!isprint(*src) ||
523 (last_hex_escape && isxdigit(*src)))) {
524 if (dest_len - used < 4) // need space for 4 letter escape
525 return -1;
526 sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"),
527 static_cast<uint8>(*src));
528 is_hex_escape = use_hex;
529 used += 4;
530 } else {
531 dest[used++] = *src; break;
532 }
533 }
534 last_hex_escape = is_hex_escape;
535 }
536
537 if (dest_len - used < 1) // make sure that there is room for \0
538 return -1;
539
540 dest[used] = '\0'; // doesn't count towards return value though
541 return used;
542 }
543
544 // Calculates the length of the C-style escaped version of 'src'.
545 // Assumes that non-printable characters are escaped using octal sequences, and
546 // that UTF-8 bytes are not handled specially.
CEscapedLength(StringPiece src)547 static inline size_t CEscapedLength(StringPiece src) {
548 static char c_escaped_len[256] = {
549 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4, // \t, \n, \r
550 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
551 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // ", '
552 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // '0'..'9'
553 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'A'..'O'
554 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, // 'P'..'Z', '\'
555 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 'a'..'o'
556 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, // 'p'..'z', DEL
557 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
558 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
559 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
560 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
561 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
562 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
563 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
564 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
565 };
566
567 size_t escaped_len = 0;
568 for (int i = 0; i < src.size(); ++i) {
569 unsigned char c = static_cast<unsigned char>(src[i]);
570 escaped_len += c_escaped_len[c];
571 }
572 return escaped_len;
573 }
574
575 // ----------------------------------------------------------------------
576 // Escapes 'src' using C-style escape sequences, and appends the escaped string
577 // to 'dest'. This version is faster than calling CEscapeInternal as it computes
578 // the required space using a lookup table, and also does not do any special
579 // handling for Hex or UTF-8 characters.
580 // ----------------------------------------------------------------------
CEscapeAndAppend(StringPiece src,string * dest)581 void CEscapeAndAppend(StringPiece src, string* dest) {
582 size_t escaped_len = CEscapedLength(src);
583 if (escaped_len == src.size()) {
584 dest->append(src.data(), src.size());
585 return;
586 }
587
588 size_t cur_dest_len = dest->size();
589 dest->resize(cur_dest_len + escaped_len);
590 char* append_ptr = &(*dest)[cur_dest_len];
591
592 for (int i = 0; i < src.size(); ++i) {
593 unsigned char c = static_cast<unsigned char>(src[i]);
594 switch (c) {
595 case '\n': *append_ptr++ = '\\'; *append_ptr++ = 'n'; break;
596 case '\r': *append_ptr++ = '\\'; *append_ptr++ = 'r'; break;
597 case '\t': *append_ptr++ = '\\'; *append_ptr++ = 't'; break;
598 case '\"': *append_ptr++ = '\\'; *append_ptr++ = '\"'; break;
599 case '\'': *append_ptr++ = '\\'; *append_ptr++ = '\''; break;
600 case '\\': *append_ptr++ = '\\'; *append_ptr++ = '\\'; break;
601 default:
602 if (!isprint(c)) {
603 *append_ptr++ = '\\';
604 *append_ptr++ = '0' + c / 64;
605 *append_ptr++ = '0' + (c % 64) / 8;
606 *append_ptr++ = '0' + c % 8;
607 } else {
608 *append_ptr++ = c;
609 }
610 break;
611 }
612 }
613 }
614
CEscape(const string & src)615 string CEscape(const string& src) {
616 string dest;
617 CEscapeAndAppend(src, &dest);
618 return dest;
619 }
620
621 namespace strings {
622
Utf8SafeCEscape(const string & src)623 string Utf8SafeCEscape(const string& src) {
624 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
625 std::unique_ptr<char[]> dest(new char[dest_length]);
626 const int len = CEscapeInternal(src.data(), src.size(),
627 dest.get(), dest_length, false, true);
628 GOOGLE_DCHECK_GE(len, 0);
629 return string(dest.get(), len);
630 }
631
CHexEscape(const string & src)632 string CHexEscape(const string& src) {
633 const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
634 std::unique_ptr<char[]> dest(new char[dest_length]);
635 const int len = CEscapeInternal(src.data(), src.size(),
636 dest.get(), dest_length, true, false);
637 GOOGLE_DCHECK_GE(len, 0);
638 return string(dest.get(), len);
639 }
640
641 } // namespace strings
642
643 // ----------------------------------------------------------------------
644 // strto32_adaptor()
645 // strtou32_adaptor()
646 // Implementation of strto[u]l replacements that have identical
647 // overflow and underflow characteristics for both ILP-32 and LP-64
648 // platforms, including errno preservation in error-free calls.
649 // ----------------------------------------------------------------------
650
strto32_adaptor(const char * nptr,char ** endptr,int base)651 int32 strto32_adaptor(const char *nptr, char **endptr, int base) {
652 const int saved_errno = errno;
653 errno = 0;
654 const long result = strtol(nptr, endptr, base);
655 if (errno == ERANGE && result == LONG_MIN) {
656 return kint32min;
657 } else if (errno == ERANGE && result == LONG_MAX) {
658 return kint32max;
659 } else if (errno == 0 && result < kint32min) {
660 errno = ERANGE;
661 return kint32min;
662 } else if (errno == 0 && result > kint32max) {
663 errno = ERANGE;
664 return kint32max;
665 }
666 if (errno == 0)
667 errno = saved_errno;
668 return static_cast<int32>(result);
669 }
670
strtou32_adaptor(const char * nptr,char ** endptr,int base)671 uint32 strtou32_adaptor(const char *nptr, char **endptr, int base) {
672 const int saved_errno = errno;
673 errno = 0;
674 const unsigned long result = strtoul(nptr, endptr, base);
675 if (errno == ERANGE && result == ULONG_MAX) {
676 return kuint32max;
677 } else if (errno == 0 && result > kuint32max) {
678 errno = ERANGE;
679 return kuint32max;
680 }
681 if (errno == 0)
682 errno = saved_errno;
683 return static_cast<uint32>(result);
684 }
685
safe_parse_sign(string * text,bool * negative_ptr)686 inline bool safe_parse_sign(string* text /*inout*/,
687 bool* negative_ptr /*output*/) {
688 const char* start = text->data();
689 const char* end = start + text->size();
690
691 // Consume whitespace.
692 while (start < end && (start[0] == ' ')) {
693 ++start;
694 }
695 while (start < end && (end[-1] == ' ')) {
696 --end;
697 }
698 if (start >= end) {
699 return false;
700 }
701
702 // Consume sign.
703 *negative_ptr = (start[0] == '-');
704 if (*negative_ptr || start[0] == '+') {
705 ++start;
706 if (start >= end) {
707 return false;
708 }
709 }
710 *text = text->substr(start - text->data(), end - start);
711 return true;
712 }
713
714 template<typename IntType>
safe_parse_positive_int(string text,IntType * value_p)715 bool safe_parse_positive_int(
716 string text, IntType* value_p) {
717 int base = 10;
718 IntType value = 0;
719 const IntType vmax = std::numeric_limits<IntType>::max();
720 assert(vmax > 0);
721 assert(vmax >= base);
722 const IntType vmax_over_base = vmax / base;
723 const char* start = text.data();
724 const char* end = start + text.size();
725 // loop over digits
726 for (; start < end; ++start) {
727 unsigned char c = static_cast<unsigned char>(start[0]);
728 int digit = c - '0';
729 if (digit >= base || digit < 0) {
730 *value_p = value;
731 return false;
732 }
733 if (value > vmax_over_base) {
734 *value_p = vmax;
735 return false;
736 }
737 value *= base;
738 if (value > vmax - digit) {
739 *value_p = vmax;
740 return false;
741 }
742 value += digit;
743 }
744 *value_p = value;
745 return true;
746 }
747
748 template<typename IntType>
safe_parse_negative_int(const string & text,IntType * value_p)749 bool safe_parse_negative_int(
750 const string& text, IntType* value_p) {
751 int base = 10;
752 IntType value = 0;
753 const IntType vmin = std::numeric_limits<IntType>::min();
754 assert(vmin < 0);
755 assert(vmin <= 0 - base);
756 IntType vmin_over_base = vmin / base;
757 // 2003 c++ standard [expr.mul]
758 // "... the sign of the remainder is implementation-defined."
759 // Although (vmin/base)*base + vmin%base is always vmin.
760 // 2011 c++ standard tightens the spec but we cannot rely on it.
761 if (vmin % base > 0) {
762 vmin_over_base += 1;
763 }
764 const char* start = text.data();
765 const char* end = start + text.size();
766 // loop over digits
767 for (; start < end; ++start) {
768 unsigned char c = static_cast<unsigned char>(start[0]);
769 int digit = c - '0';
770 if (digit >= base || digit < 0) {
771 *value_p = value;
772 return false;
773 }
774 if (value < vmin_over_base) {
775 *value_p = vmin;
776 return false;
777 }
778 value *= base;
779 if (value < vmin + digit) {
780 *value_p = vmin;
781 return false;
782 }
783 value -= digit;
784 }
785 *value_p = value;
786 return true;
787 }
788
789 template<typename IntType>
safe_int_internal(string text,IntType * value_p)790 bool safe_int_internal(string text, IntType* value_p) {
791 *value_p = 0;
792 bool negative;
793 if (!safe_parse_sign(&text, &negative)) {
794 return false;
795 }
796 if (!negative) {
797 return safe_parse_positive_int(text, value_p);
798 } else {
799 return safe_parse_negative_int(text, value_p);
800 }
801 }
802
803 template<typename IntType>
safe_uint_internal(string text,IntType * value_p)804 bool safe_uint_internal(string text, IntType* value_p) {
805 *value_p = 0;
806 bool negative;
807 if (!safe_parse_sign(&text, &negative) || negative) {
808 return false;
809 }
810 return safe_parse_positive_int(text, value_p);
811 }
812
813 // ----------------------------------------------------------------------
814 // FastIntToBuffer()
815 // FastInt64ToBuffer()
816 // FastHexToBuffer()
817 // FastHex64ToBuffer()
818 // FastHex32ToBuffer()
819 // ----------------------------------------------------------------------
820
821 // Offset into buffer where FastInt64ToBuffer places the end of string
822 // null character. Also used by FastInt64ToBufferLeft.
823 static const int kFastInt64ToBufferOffset = 21;
824
FastInt64ToBuffer(int64 i,char * buffer)825 char *FastInt64ToBuffer(int64 i, char* buffer) {
826 // We could collapse the positive and negative sections, but that
827 // would be slightly slower for positive numbers...
828 // 22 bytes is enough to store -2**64, -18446744073709551616.
829 char* p = buffer + kFastInt64ToBufferOffset;
830 *p-- = '\0';
831 if (i >= 0) {
832 do {
833 *p-- = '0' + i % 10;
834 i /= 10;
835 } while (i > 0);
836 return p + 1;
837 } else {
838 // On different platforms, % and / have different behaviors for
839 // negative numbers, so we need to jump through hoops to make sure
840 // we don't divide negative numbers.
841 if (i > -10) {
842 i = -i;
843 *p-- = '0' + i;
844 *p = '-';
845 return p;
846 } else {
847 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
848 i = i + 10;
849 i = -i;
850 *p-- = '0' + i % 10;
851 // Undo what we did a moment ago
852 i = i / 10 + 1;
853 do {
854 *p-- = '0' + i % 10;
855 i /= 10;
856 } while (i > 0);
857 *p = '-';
858 return p;
859 }
860 }
861 }
862
863 // Offset into buffer where FastInt32ToBuffer places the end of string
864 // null character. Also used by FastInt32ToBufferLeft
865 static const int kFastInt32ToBufferOffset = 11;
866
867 // Yes, this is a duplicate of FastInt64ToBuffer. But, we need this for the
868 // compiler to generate 32 bit arithmetic instructions. It's much faster, at
869 // least with 32 bit binaries.
FastInt32ToBuffer(int32 i,char * buffer)870 char *FastInt32ToBuffer(int32 i, char* buffer) {
871 // We could collapse the positive and negative sections, but that
872 // would be slightly slower for positive numbers...
873 // 12 bytes is enough to store -2**32, -4294967296.
874 char* p = buffer + kFastInt32ToBufferOffset;
875 *p-- = '\0';
876 if (i >= 0) {
877 do {
878 *p-- = '0' + i % 10;
879 i /= 10;
880 } while (i > 0);
881 return p + 1;
882 } else {
883 // On different platforms, % and / have different behaviors for
884 // negative numbers, so we need to jump through hoops to make sure
885 // we don't divide negative numbers.
886 if (i > -10) {
887 i = -i;
888 *p-- = '0' + i;
889 *p = '-';
890 return p;
891 } else {
892 // Make sure we aren't at MIN_INT, in which case we can't say i = -i
893 i = i + 10;
894 i = -i;
895 *p-- = '0' + i % 10;
896 // Undo what we did a moment ago
897 i = i / 10 + 1;
898 do {
899 *p-- = '0' + i % 10;
900 i /= 10;
901 } while (i > 0);
902 *p = '-';
903 return p;
904 }
905 }
906 }
907
FastHexToBuffer(int i,char * buffer)908 char *FastHexToBuffer(int i, char* buffer) {
909 GOOGLE_CHECK(i >= 0) << "FastHexToBuffer() wants non-negative integers, not " << i;
910
911 static const char *hexdigits = "0123456789abcdef";
912 char *p = buffer + 21;
913 *p-- = '\0';
914 do {
915 *p-- = hexdigits[i & 15]; // mod by 16
916 i >>= 4; // divide by 16
917 } while (i > 0);
918 return p + 1;
919 }
920
InternalFastHexToBuffer(uint64 value,char * buffer,int num_byte)921 char *InternalFastHexToBuffer(uint64 value, char* buffer, int num_byte) {
922 static const char *hexdigits = "0123456789abcdef";
923 buffer[num_byte] = '\0';
924 for (int i = num_byte - 1; i >= 0; i--) {
925 #ifdef _M_X64
926 // MSVC x64 platform has a bug optimizing the uint32(value) in the #else
927 // block. Given that the uint32 cast was to improve performance on 32-bit
928 // platforms, we use 64-bit '&' directly.
929 buffer[i] = hexdigits[value & 0xf];
930 #else
931 buffer[i] = hexdigits[uint32(value) & 0xf];
932 #endif
933 value >>= 4;
934 }
935 return buffer;
936 }
937
FastHex64ToBuffer(uint64 value,char * buffer)938 char *FastHex64ToBuffer(uint64 value, char* buffer) {
939 return InternalFastHexToBuffer(value, buffer, 16);
940 }
941
FastHex32ToBuffer(uint32 value,char * buffer)942 char *FastHex32ToBuffer(uint32 value, char* buffer) {
943 return InternalFastHexToBuffer(value, buffer, 8);
944 }
945
946 // ----------------------------------------------------------------------
947 // FastInt32ToBufferLeft()
948 // FastUInt32ToBufferLeft()
949 // FastInt64ToBufferLeft()
950 // FastUInt64ToBufferLeft()
951 //
952 // Like the Fast*ToBuffer() functions above, these are intended for speed.
953 // Unlike the Fast*ToBuffer() functions, however, these functions write
954 // their output to the beginning of the buffer (hence the name, as the
955 // output is left-aligned). The caller is responsible for ensuring that
956 // the buffer has enough space to hold the output.
957 //
958 // Returns a pointer to the end of the string (i.e. the null character
959 // terminating the string).
960 // ----------------------------------------------------------------------
961
962 static const char two_ASCII_digits[100][2] = {
963 {'0','0'}, {'0','1'}, {'0','2'}, {'0','3'}, {'0','4'},
964 {'0','5'}, {'0','6'}, {'0','7'}, {'0','8'}, {'0','9'},
965 {'1','0'}, {'1','1'}, {'1','2'}, {'1','3'}, {'1','4'},
966 {'1','5'}, {'1','6'}, {'1','7'}, {'1','8'}, {'1','9'},
967 {'2','0'}, {'2','1'}, {'2','2'}, {'2','3'}, {'2','4'},
968 {'2','5'}, {'2','6'}, {'2','7'}, {'2','8'}, {'2','9'},
969 {'3','0'}, {'3','1'}, {'3','2'}, {'3','3'}, {'3','4'},
970 {'3','5'}, {'3','6'}, {'3','7'}, {'3','8'}, {'3','9'},
971 {'4','0'}, {'4','1'}, {'4','2'}, {'4','3'}, {'4','4'},
972 {'4','5'}, {'4','6'}, {'4','7'}, {'4','8'}, {'4','9'},
973 {'5','0'}, {'5','1'}, {'5','2'}, {'5','3'}, {'5','4'},
974 {'5','5'}, {'5','6'}, {'5','7'}, {'5','8'}, {'5','9'},
975 {'6','0'}, {'6','1'}, {'6','2'}, {'6','3'}, {'6','4'},
976 {'6','5'}, {'6','6'}, {'6','7'}, {'6','8'}, {'6','9'},
977 {'7','0'}, {'7','1'}, {'7','2'}, {'7','3'}, {'7','4'},
978 {'7','5'}, {'7','6'}, {'7','7'}, {'7','8'}, {'7','9'},
979 {'8','0'}, {'8','1'}, {'8','2'}, {'8','3'}, {'8','4'},
980 {'8','5'}, {'8','6'}, {'8','7'}, {'8','8'}, {'8','9'},
981 {'9','0'}, {'9','1'}, {'9','2'}, {'9','3'}, {'9','4'},
982 {'9','5'}, {'9','6'}, {'9','7'}, {'9','8'}, {'9','9'}
983 };
984
FastUInt32ToBufferLeft(uint32 u,char * buffer)985 char* FastUInt32ToBufferLeft(uint32 u, char* buffer) {
986 uint32 digits;
987 const char *ASCII_digits = nullptr;
988 // The idea of this implementation is to trim the number of divides to as few
989 // as possible by using multiplication and subtraction rather than mod (%),
990 // and by outputting two digits at a time rather than one.
991 // The huge-number case is first, in the hopes that the compiler will output
992 // that case in one branch-free block of code, and only output conditional
993 // branches into it from below.
994 if (u >= 1000000000) { // >= 1,000,000,000
995 digits = u / 100000000; // 100,000,000
996 ASCII_digits = two_ASCII_digits[digits];
997 buffer[0] = ASCII_digits[0];
998 buffer[1] = ASCII_digits[1];
999 buffer += 2;
1000 sublt100_000_000:
1001 u -= digits * 100000000; // 100,000,000
1002 lt100_000_000:
1003 digits = u / 1000000; // 1,000,000
1004 ASCII_digits = two_ASCII_digits[digits];
1005 buffer[0] = ASCII_digits[0];
1006 buffer[1] = ASCII_digits[1];
1007 buffer += 2;
1008 sublt1_000_000:
1009 u -= digits * 1000000; // 1,000,000
1010 lt1_000_000:
1011 digits = u / 10000; // 10,000
1012 ASCII_digits = two_ASCII_digits[digits];
1013 buffer[0] = ASCII_digits[0];
1014 buffer[1] = ASCII_digits[1];
1015 buffer += 2;
1016 sublt10_000:
1017 u -= digits * 10000; // 10,000
1018 lt10_000:
1019 digits = u / 100;
1020 ASCII_digits = two_ASCII_digits[digits];
1021 buffer[0] = ASCII_digits[0];
1022 buffer[1] = ASCII_digits[1];
1023 buffer += 2;
1024 sublt100:
1025 u -= digits * 100;
1026 lt100:
1027 digits = u;
1028 ASCII_digits = two_ASCII_digits[digits];
1029 buffer[0] = ASCII_digits[0];
1030 buffer[1] = ASCII_digits[1];
1031 buffer += 2;
1032 done:
1033 *buffer = 0;
1034 return buffer;
1035 }
1036
1037 if (u < 100) {
1038 digits = u;
1039 if (u >= 10) goto lt100;
1040 *buffer++ = '0' + digits;
1041 goto done;
1042 }
1043 if (u < 10000) { // 10,000
1044 if (u >= 1000) goto lt10_000;
1045 digits = u / 100;
1046 *buffer++ = '0' + digits;
1047 goto sublt100;
1048 }
1049 if (u < 1000000) { // 1,000,000
1050 if (u >= 100000) goto lt1_000_000;
1051 digits = u / 10000; // 10,000
1052 *buffer++ = '0' + digits;
1053 goto sublt10_000;
1054 }
1055 if (u < 100000000) { // 100,000,000
1056 if (u >= 10000000) goto lt100_000_000;
1057 digits = u / 1000000; // 1,000,000
1058 *buffer++ = '0' + digits;
1059 goto sublt1_000_000;
1060 }
1061 // we already know that u < 1,000,000,000
1062 digits = u / 100000000; // 100,000,000
1063 *buffer++ = '0' + digits;
1064 goto sublt100_000_000;
1065 }
1066
FastInt32ToBufferLeft(int32 i,char * buffer)1067 char* FastInt32ToBufferLeft(int32 i, char* buffer) {
1068 uint32 u = i;
1069 if (i < 0) {
1070 *buffer++ = '-';
1071 u = -i;
1072 }
1073 return FastUInt32ToBufferLeft(u, buffer);
1074 }
1075
FastUInt64ToBufferLeft(uint64 u64,char * buffer)1076 char* FastUInt64ToBufferLeft(uint64 u64, char* buffer) {
1077 int digits;
1078 const char *ASCII_digits = nullptr;
1079
1080 uint32 u = static_cast<uint32>(u64);
1081 if (u == u64) return FastUInt32ToBufferLeft(u, buffer);
1082
1083 uint64 top_11_digits = u64 / 1000000000;
1084 buffer = FastUInt64ToBufferLeft(top_11_digits, buffer);
1085 u = u64 - (top_11_digits * 1000000000);
1086
1087 digits = u / 10000000; // 10,000,000
1088 GOOGLE_DCHECK_LT(digits, 100);
1089 ASCII_digits = two_ASCII_digits[digits];
1090 buffer[0] = ASCII_digits[0];
1091 buffer[1] = ASCII_digits[1];
1092 buffer += 2;
1093 u -= digits * 10000000; // 10,000,000
1094 digits = u / 100000; // 100,000
1095 ASCII_digits = two_ASCII_digits[digits];
1096 buffer[0] = ASCII_digits[0];
1097 buffer[1] = ASCII_digits[1];
1098 buffer += 2;
1099 u -= digits * 100000; // 100,000
1100 digits = u / 1000; // 1,000
1101 ASCII_digits = two_ASCII_digits[digits];
1102 buffer[0] = ASCII_digits[0];
1103 buffer[1] = ASCII_digits[1];
1104 buffer += 2;
1105 u -= digits * 1000; // 1,000
1106 digits = u / 10;
1107 ASCII_digits = two_ASCII_digits[digits];
1108 buffer[0] = ASCII_digits[0];
1109 buffer[1] = ASCII_digits[1];
1110 buffer += 2;
1111 u -= digits * 10;
1112 digits = u;
1113 *buffer++ = '0' + digits;
1114 *buffer = 0;
1115 return buffer;
1116 }
1117
FastInt64ToBufferLeft(int64 i,char * buffer)1118 char* FastInt64ToBufferLeft(int64 i, char* buffer) {
1119 uint64 u = 0;
1120 if (i < 0) {
1121 *buffer++ = '-';
1122 u -= i;
1123 } else {
1124 u = i;
1125 }
1126 return FastUInt64ToBufferLeft(u, buffer);
1127 }
1128
1129 // ----------------------------------------------------------------------
1130 // SimpleItoa()
1131 // Description: converts an integer to a string.
1132 //
1133 // Return value: string
1134 // ----------------------------------------------------------------------
1135
SimpleItoa(int i)1136 string SimpleItoa(int i) {
1137 char buffer[kFastToBufferSize];
1138 return (sizeof(i) == 4) ?
1139 FastInt32ToBuffer(i, buffer) :
1140 FastInt64ToBuffer(i, buffer);
1141 }
1142
SimpleItoa(unsigned int i)1143 string SimpleItoa(unsigned int i) {
1144 char buffer[kFastToBufferSize];
1145 return string(buffer, (sizeof(i) == 4) ?
1146 FastUInt32ToBufferLeft(i, buffer) :
1147 FastUInt64ToBufferLeft(i, buffer));
1148 }
1149
SimpleItoa(long i)1150 string SimpleItoa(long i) {
1151 char buffer[kFastToBufferSize];
1152 return (sizeof(i) == 4) ?
1153 FastInt32ToBuffer(i, buffer) :
1154 FastInt64ToBuffer(i, buffer);
1155 }
1156
SimpleItoa(unsigned long i)1157 string SimpleItoa(unsigned long i) {
1158 char buffer[kFastToBufferSize];
1159 return string(buffer, (sizeof(i) == 4) ?
1160 FastUInt32ToBufferLeft(i, buffer) :
1161 FastUInt64ToBufferLeft(i, buffer));
1162 }
1163
SimpleItoa(long long i)1164 string SimpleItoa(long long i) {
1165 char buffer[kFastToBufferSize];
1166 return (sizeof(i) == 4) ?
1167 FastInt32ToBuffer(i, buffer) :
1168 FastInt64ToBuffer(i, buffer);
1169 }
1170
SimpleItoa(unsigned long long i)1171 string SimpleItoa(unsigned long long i) {
1172 char buffer[kFastToBufferSize];
1173 return string(buffer, (sizeof(i) == 4) ?
1174 FastUInt32ToBufferLeft(i, buffer) :
1175 FastUInt64ToBufferLeft(i, buffer));
1176 }
1177
1178 // ----------------------------------------------------------------------
1179 // SimpleDtoa()
1180 // SimpleFtoa()
1181 // DoubleToBuffer()
1182 // FloatToBuffer()
1183 // We want to print the value without losing precision, but we also do
1184 // not want to print more digits than necessary. This turns out to be
1185 // trickier than it sounds. Numbers like 0.2 cannot be represented
1186 // exactly in binary. If we print 0.2 with a very large precision,
1187 // e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
1188 // On the other hand, if we set the precision too low, we lose
1189 // significant digits when printing numbers that actually need them.
1190 // It turns out there is no precision value that does the right thing
1191 // for all numbers.
1192 //
1193 // Our strategy is to first try printing with a precision that is never
1194 // over-precise, then parse the result with strtod() to see if it
1195 // matches. If not, we print again with a precision that will always
1196 // give a precise result, but may use more digits than necessary.
1197 //
1198 // An arguably better strategy would be to use the algorithm described
1199 // in "How to Print Floating-Point Numbers Accurately" by Steele &
1200 // White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
1201 // however, that the following implementation is about as fast as
1202 // DMG's code. Furthermore, DMG's code locks mutexes, which means it
1203 // will not scale well on multi-core machines. DMG's code is slightly
1204 // more accurate (in that it will never use more digits than
1205 // necessary), but this is probably irrelevant for most users.
1206 //
1207 // Rob Pike and Ken Thompson also have an implementation of dtoa() in
1208 // third_party/fmt/fltfmt.cc. Their implementation is similar to this
1209 // one in that it makes guesses and then uses strtod() to check them.
1210 // Their implementation is faster because they use their own code to
1211 // generate the digits in the first place rather than use snprintf(),
1212 // thus avoiding format string parsing overhead. However, this makes
1213 // it considerably more complicated than the following implementation,
1214 // and it is embedded in a larger library. If speed turns out to be
1215 // an issue, we could re-implement this in terms of their
1216 // implementation.
1217 // ----------------------------------------------------------------------
1218
SimpleDtoa(double value)1219 string SimpleDtoa(double value) {
1220 char buffer[kDoubleToBufferSize];
1221 return DoubleToBuffer(value, buffer);
1222 }
1223
SimpleFtoa(float value)1224 string SimpleFtoa(float value) {
1225 char buffer[kFloatToBufferSize];
1226 return FloatToBuffer(value, buffer);
1227 }
1228
IsValidFloatChar(char c)1229 static inline bool IsValidFloatChar(char c) {
1230 return ('0' <= c && c <= '9') ||
1231 c == 'e' || c == 'E' ||
1232 c == '+' || c == '-';
1233 }
1234
DelocalizeRadix(char * buffer)1235 void DelocalizeRadix(char* buffer) {
1236 // Fast check: if the buffer has a normal decimal point, assume no
1237 // translation is needed.
1238 if (strchr(buffer, '.') != nullptr) return;
1239
1240 // Find the first unknown character.
1241 while (IsValidFloatChar(*buffer)) ++buffer;
1242
1243 if (*buffer == '\0') {
1244 // No radix character found.
1245 return;
1246 }
1247
1248 // We are now pointing at the locale-specific radix character. Replace it
1249 // with '.'.
1250 *buffer = '.';
1251 ++buffer;
1252
1253 if (!IsValidFloatChar(*buffer) && *buffer != '\0') {
1254 // It appears the radix was a multi-byte character. We need to remove the
1255 // extra bytes.
1256 char* target = buffer;
1257 do { ++buffer; } while (!IsValidFloatChar(*buffer) && *buffer != '\0');
1258 memmove(target, buffer, strlen(buffer) + 1);
1259 }
1260 }
1261
DoubleToBuffer(double value,char * buffer)1262 char* DoubleToBuffer(double value, char* buffer) {
1263 // DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
1264 // platforms these days. Just in case some system exists where DBL_DIG
1265 // is significantly larger -- and risks overflowing our buffer -- we have
1266 // this assert.
1267 GOOGLE_COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
1268
1269 if (value == std::numeric_limits<double>::infinity()) {
1270 strcpy(buffer, "inf");
1271 return buffer;
1272 } else if (value == -std::numeric_limits<double>::infinity()) {
1273 strcpy(buffer, "-inf");
1274 return buffer;
1275 } else if (std::isnan(value)) {
1276 strcpy(buffer, "nan");
1277 return buffer;
1278 }
1279
1280 int snprintf_result =
1281 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG, value);
1282
1283 // The snprintf should never overflow because the buffer is significantly
1284 // larger than the precision we asked for.
1285 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1286
1287 // We need to make parsed_value volatile in order to force the compiler to
1288 // write it out to the stack. Otherwise, it may keep the value in a
1289 // register, and if it does that, it may keep it as a long double instead
1290 // of a double. This long double may have extra bits that make it compare
1291 // unequal to "value" even though it would be exactly equal if it were
1292 // truncated to a double.
1293 volatile double parsed_value = io::NoLocaleStrtod(buffer, nullptr);
1294 if (parsed_value != value) {
1295 int snprintf_result =
1296 snprintf(buffer, kDoubleToBufferSize, "%.*g", DBL_DIG+2, value);
1297
1298 // Should never overflow; see above.
1299 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kDoubleToBufferSize);
1300 }
1301
1302 DelocalizeRadix(buffer);
1303 return buffer;
1304 }
1305
memcasecmp(const char * s1,const char * s2,size_t len)1306 static int memcasecmp(const char *s1, const char *s2, size_t len) {
1307 const unsigned char *us1 = reinterpret_cast<const unsigned char *>(s1);
1308 const unsigned char *us2 = reinterpret_cast<const unsigned char *>(s2);
1309
1310 for ( int i = 0; i < len; i++ ) {
1311 const int diff =
1312 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us1[i]))) -
1313 static_cast<int>(static_cast<unsigned char>(ascii_tolower(us2[i])));
1314 if (diff != 0) return diff;
1315 }
1316 return 0;
1317 }
1318
CaseEqual(StringPiece s1,StringPiece s2)1319 inline bool CaseEqual(StringPiece s1, StringPiece s2) {
1320 if (s1.size() != s2.size()) return false;
1321 return memcasecmp(s1.data(), s2.data(), s1.size()) == 0;
1322 }
1323
safe_strtob(StringPiece str,bool * value)1324 bool safe_strtob(StringPiece str, bool* value) {
1325 GOOGLE_CHECK(value != nullptr) << "nullptr output boolean given.";
1326 if (CaseEqual(str, "true") || CaseEqual(str, "t") ||
1327 CaseEqual(str, "yes") || CaseEqual(str, "y") ||
1328 CaseEqual(str, "1")) {
1329 *value = true;
1330 return true;
1331 }
1332 if (CaseEqual(str, "false") || CaseEqual(str, "f") ||
1333 CaseEqual(str, "no") || CaseEqual(str, "n") ||
1334 CaseEqual(str, "0")) {
1335 *value = false;
1336 return true;
1337 }
1338 return false;
1339 }
1340
safe_strtof(const char * str,float * value)1341 bool safe_strtof(const char* str, float* value) {
1342 char* endptr;
1343 errno = 0; // errno only gets set on errors
1344 #if defined(_WIN32) || defined (__hpux) // has no strtof()
1345 *value = io::NoLocaleStrtod(str, &endptr);
1346 #else
1347 *value = strtof(str, &endptr);
1348 #endif
1349 return *str != 0 && *endptr == 0 && errno == 0;
1350 }
1351
safe_strtod(const char * str,double * value)1352 bool safe_strtod(const char* str, double* value) {
1353 char* endptr;
1354 *value = io::NoLocaleStrtod(str, &endptr);
1355 if (endptr != str) {
1356 while (ascii_isspace(*endptr)) ++endptr;
1357 }
1358 // Ignore range errors from strtod. The values it
1359 // returns on underflow and overflow are the right
1360 // fallback in a robust setting.
1361 return *str != '\0' && *endptr == '\0';
1362 }
1363
safe_strto32(const string & str,int32 * value)1364 bool safe_strto32(const string& str, int32* value) {
1365 return safe_int_internal(str, value);
1366 }
1367
safe_strtou32(const string & str,uint32 * value)1368 bool safe_strtou32(const string& str, uint32* value) {
1369 return safe_uint_internal(str, value);
1370 }
1371
safe_strto64(const string & str,int64 * value)1372 bool safe_strto64(const string& str, int64* value) {
1373 return safe_int_internal(str, value);
1374 }
1375
safe_strtou64(const string & str,uint64 * value)1376 bool safe_strtou64(const string& str, uint64* value) {
1377 return safe_uint_internal(str, value);
1378 }
1379
FloatToBuffer(float value,char * buffer)1380 char* FloatToBuffer(float value, char* buffer) {
1381 // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
1382 // platforms these days. Just in case some system exists where FLT_DIG
1383 // is significantly larger -- and risks overflowing our buffer -- we have
1384 // this assert.
1385 GOOGLE_COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
1386
1387 if (value == std::numeric_limits<double>::infinity()) {
1388 strcpy(buffer, "inf");
1389 return buffer;
1390 } else if (value == -std::numeric_limits<double>::infinity()) {
1391 strcpy(buffer, "-inf");
1392 return buffer;
1393 } else if (std::isnan(value)) {
1394 strcpy(buffer, "nan");
1395 return buffer;
1396 }
1397
1398 int snprintf_result =
1399 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG, value);
1400
1401 // The snprintf should never overflow because the buffer is significantly
1402 // larger than the precision we asked for.
1403 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1404
1405 float parsed_value;
1406 if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
1407 int snprintf_result =
1408 snprintf(buffer, kFloatToBufferSize, "%.*g", FLT_DIG+3, value);
1409
1410 // Should never overflow; see above.
1411 GOOGLE_DCHECK(snprintf_result > 0 && snprintf_result < kFloatToBufferSize);
1412 }
1413
1414 DelocalizeRadix(buffer);
1415 return buffer;
1416 }
1417
1418 namespace strings {
1419
AlphaNum(strings::Hex hex)1420 AlphaNum::AlphaNum(strings::Hex hex) {
1421 char *const end = &digits[kFastToBufferSize];
1422 char *writer = end;
1423 uint64 value = hex.value;
1424 uint64 width = hex.spec;
1425 // We accomplish minimum width by OR'ing in 0x10000 to the user's value,
1426 // where 0x10000 is the smallest hex number that is as wide as the user
1427 // asked for.
1428 uint64 mask = ((static_cast<uint64>(1) << (width - 1) * 4)) | value;
1429 static const char hexdigits[] = "0123456789abcdef";
1430 do {
1431 *--writer = hexdigits[value & 0xF];
1432 value >>= 4;
1433 mask >>= 4;
1434 } while (mask != 0);
1435 piece_data_ = writer;
1436 piece_size_ = end - writer;
1437 }
1438
1439 } // namespace strings
1440
1441 // ----------------------------------------------------------------------
1442 // StrCat()
1443 // This merges the given strings or integers, with no delimiter. This
1444 // is designed to be the fastest possible way to construct a string out
1445 // of a mix of raw C strings, C++ strings, and integer values.
1446 // ----------------------------------------------------------------------
1447
1448 // Append is merely a version of memcpy that returns the address of the byte
1449 // after the area just overwritten. It comes in multiple flavors to minimize
1450 // call overhead.
Append1(char * out,const AlphaNum & x)1451 static char *Append1(char *out, const AlphaNum &x) {
1452 memcpy(out, x.data(), x.size());
1453 return out + x.size();
1454 }
1455
Append2(char * out,const AlphaNum & x1,const AlphaNum & x2)1456 static char *Append2(char *out, const AlphaNum &x1, const AlphaNum &x2) {
1457 memcpy(out, x1.data(), x1.size());
1458 out += x1.size();
1459
1460 memcpy(out, x2.data(), x2.size());
1461 return out + x2.size();
1462 }
1463
Append4(char * out,const AlphaNum & x1,const AlphaNum & x2,const AlphaNum & x3,const AlphaNum & x4)1464 static char *Append4(char *out,
1465 const AlphaNum &x1, const AlphaNum &x2,
1466 const AlphaNum &x3, const AlphaNum &x4) {
1467 memcpy(out, x1.data(), x1.size());
1468 out += x1.size();
1469
1470 memcpy(out, x2.data(), x2.size());
1471 out += x2.size();
1472
1473 memcpy(out, x3.data(), x3.size());
1474 out += x3.size();
1475
1476 memcpy(out, x4.data(), x4.size());
1477 return out + x4.size();
1478 }
1479
StrCat(const AlphaNum & a,const AlphaNum & b)1480 string StrCat(const AlphaNum &a, const AlphaNum &b) {
1481 string result;
1482 result.resize(a.size() + b.size());
1483 char *const begin = &*result.begin();
1484 char *out = Append2(begin, a, b);
1485 GOOGLE_DCHECK_EQ(out, begin + result.size());
1486 return result;
1487 }
1488
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1489 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1490 string result;
1491 result.resize(a.size() + b.size() + c.size());
1492 char *const begin = &*result.begin();
1493 char *out = Append2(begin, a, b);
1494 out = Append1(out, c);
1495 GOOGLE_DCHECK_EQ(out, begin + result.size());
1496 return result;
1497 }
1498
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1499 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1500 const AlphaNum &d) {
1501 string result;
1502 result.resize(a.size() + b.size() + c.size() + d.size());
1503 char *const begin = &*result.begin();
1504 char *out = Append4(begin, a, b, c, d);
1505 GOOGLE_DCHECK_EQ(out, begin + result.size());
1506 return result;
1507 }
1508
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e)1509 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1510 const AlphaNum &d, const AlphaNum &e) {
1511 string result;
1512 result.resize(a.size() + b.size() + c.size() + d.size() + e.size());
1513 char *const begin = &*result.begin();
1514 char *out = Append4(begin, a, b, c, d);
1515 out = Append1(out, e);
1516 GOOGLE_DCHECK_EQ(out, begin + result.size());
1517 return result;
1518 }
1519
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f)1520 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1521 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f) {
1522 string result;
1523 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1524 f.size());
1525 char *const begin = &*result.begin();
1526 char *out = Append4(begin, a, b, c, d);
1527 out = Append2(out, e, f);
1528 GOOGLE_DCHECK_EQ(out, begin + result.size());
1529 return result;
1530 }
1531
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g)1532 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1533 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1534 const AlphaNum &g) {
1535 string result;
1536 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1537 f.size() + g.size());
1538 char *const begin = &*result.begin();
1539 char *out = Append4(begin, a, b, c, d);
1540 out = Append2(out, e, f);
1541 out = Append1(out, g);
1542 GOOGLE_DCHECK_EQ(out, begin + result.size());
1543 return result;
1544 }
1545
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h)1546 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1547 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1548 const AlphaNum &g, const AlphaNum &h) {
1549 string result;
1550 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1551 f.size() + g.size() + h.size());
1552 char *const begin = &*result.begin();
1553 char *out = Append4(begin, a, b, c, d);
1554 out = Append4(out, e, f, g, h);
1555 GOOGLE_DCHECK_EQ(out, begin + result.size());
1556 return result;
1557 }
1558
StrCat(const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d,const AlphaNum & e,const AlphaNum & f,const AlphaNum & g,const AlphaNum & h,const AlphaNum & i)1559 string StrCat(const AlphaNum &a, const AlphaNum &b, const AlphaNum &c,
1560 const AlphaNum &d, const AlphaNum &e, const AlphaNum &f,
1561 const AlphaNum &g, const AlphaNum &h, const AlphaNum &i) {
1562 string result;
1563 result.resize(a.size() + b.size() + c.size() + d.size() + e.size() +
1564 f.size() + g.size() + h.size() + i.size());
1565 char *const begin = &*result.begin();
1566 char *out = Append4(begin, a, b, c, d);
1567 out = Append4(out, e, f, g, h);
1568 out = Append1(out, i);
1569 GOOGLE_DCHECK_EQ(out, begin + result.size());
1570 return result;
1571 }
1572
1573 // It's possible to call StrAppend with a char * pointer that is partway into
1574 // the string we're appending to. However the results of this are random.
1575 // Therefore, check for this in debug mode. Use unsigned math so we only have
1576 // to do one comparison.
1577 #define GOOGLE_DCHECK_NO_OVERLAP(dest, src) \
1578 GOOGLE_DCHECK_GT(uintptr_t((src).data() - (dest).data()), \
1579 uintptr_t((dest).size()))
1580
StrAppend(string * result,const AlphaNum & a)1581 void StrAppend(string *result, const AlphaNum &a) {
1582 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1583 result->append(a.data(), a.size());
1584 }
1585
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b)1586 void StrAppend(string *result, const AlphaNum &a, const AlphaNum &b) {
1587 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1588 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1589 string::size_type old_size = result->size();
1590 result->resize(old_size + a.size() + b.size());
1591 char *const begin = &*result->begin();
1592 char *out = Append2(begin + old_size, a, b);
1593 GOOGLE_DCHECK_EQ(out, begin + result->size());
1594 }
1595
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c)1596 void StrAppend(string *result,
1597 const AlphaNum &a, const AlphaNum &b, const AlphaNum &c) {
1598 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1599 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1600 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1601 string::size_type old_size = result->size();
1602 result->resize(old_size + a.size() + b.size() + c.size());
1603 char *const begin = &*result->begin();
1604 char *out = Append2(begin + old_size, a, b);
1605 out = Append1(out, c);
1606 GOOGLE_DCHECK_EQ(out, begin + result->size());
1607 }
1608
StrAppend(string * result,const AlphaNum & a,const AlphaNum & b,const AlphaNum & c,const AlphaNum & d)1609 void StrAppend(string *result,
1610 const AlphaNum &a, const AlphaNum &b,
1611 const AlphaNum &c, const AlphaNum &d) {
1612 GOOGLE_DCHECK_NO_OVERLAP(*result, a);
1613 GOOGLE_DCHECK_NO_OVERLAP(*result, b);
1614 GOOGLE_DCHECK_NO_OVERLAP(*result, c);
1615 GOOGLE_DCHECK_NO_OVERLAP(*result, d);
1616 string::size_type old_size = result->size();
1617 result->resize(old_size + a.size() + b.size() + c.size() + d.size());
1618 char *const begin = &*result->begin();
1619 char *out = Append4(begin + old_size, a, b, c, d);
1620 GOOGLE_DCHECK_EQ(out, begin + result->size());
1621 }
1622
GlobalReplaceSubstring(const string & substring,const string & replacement,string * s)1623 int GlobalReplaceSubstring(const string& substring,
1624 const string& replacement,
1625 string* s) {
1626 GOOGLE_CHECK(s != nullptr);
1627 if (s->empty() || substring.empty())
1628 return 0;
1629 string tmp;
1630 int num_replacements = 0;
1631 int pos = 0;
1632 for (int match_pos = s->find(substring.data(), pos, substring.length());
1633 match_pos != string::npos;
1634 pos = match_pos + substring.length(),
1635 match_pos = s->find(substring.data(), pos, substring.length())) {
1636 ++num_replacements;
1637 // Append the original content before the match.
1638 tmp.append(*s, pos, match_pos - pos);
1639 // Append the replacement for the match.
1640 tmp.append(replacement.begin(), replacement.end());
1641 }
1642 // Append the content after the last match. If no replacements were made, the
1643 // original string is left untouched.
1644 if (num_replacements > 0) {
1645 tmp.append(*s, pos, s->length() - pos);
1646 s->swap(tmp);
1647 }
1648 return num_replacements;
1649 }
1650
CalculateBase64EscapedLen(int input_len,bool do_padding)1651 int CalculateBase64EscapedLen(int input_len, bool do_padding) {
1652 // Base64 encodes three bytes of input at a time. If the input is not
1653 // divisible by three, we pad as appropriate.
1654 //
1655 // (from http://tools.ietf.org/html/rfc3548)
1656 // Special processing is performed if fewer than 24 bits are available
1657 // at the end of the data being encoded. A full encoding quantum is
1658 // always completed at the end of a quantity. When fewer than 24 input
1659 // bits are available in an input group, zero bits are added (on the
1660 // right) to form an integral number of 6-bit groups. Padding at the
1661 // end of the data is performed using the '=' character. Since all base
1662 // 64 input is an integral number of octets, only the following cases
1663 // can arise:
1664
1665
1666 // Base64 encodes each three bytes of input into four bytes of output.
1667 int len = (input_len / 3) * 4;
1668
1669 if (input_len % 3 == 0) {
1670 // (from http://tools.ietf.org/html/rfc3548)
1671 // (1) the final quantum of encoding input is an integral multiple of 24
1672 // bits; here, the final unit of encoded output will be an integral
1673 // multiple of 4 characters with no "=" padding,
1674 } else if (input_len % 3 == 1) {
1675 // (from http://tools.ietf.org/html/rfc3548)
1676 // (2) the final quantum of encoding input is exactly 8 bits; here, the
1677 // final unit of encoded output will be two characters followed by two
1678 // "=" padding characters, or
1679 len += 2;
1680 if (do_padding) {
1681 len += 2;
1682 }
1683 } else { // (input_len % 3 == 2)
1684 // (from http://tools.ietf.org/html/rfc3548)
1685 // (3) the final quantum of encoding input is exactly 16 bits; here, the
1686 // final unit of encoded output will be three characters followed by one
1687 // "=" padding character.
1688 len += 3;
1689 if (do_padding) {
1690 len += 1;
1691 }
1692 }
1693
1694 assert(len >= input_len); // make sure we didn't overflow
1695 return len;
1696 }
1697
1698 // Base64Escape does padding, so this calculation includes padding.
CalculateBase64EscapedLen(int input_len)1699 int CalculateBase64EscapedLen(int input_len) {
1700 return CalculateBase64EscapedLen(input_len, true);
1701 }
1702
1703 // ----------------------------------------------------------------------
1704 // int Base64Unescape() - base64 decoder
1705 // int Base64Escape() - base64 encoder
1706 // int WebSafeBase64Unescape() - Google's variation of base64 decoder
1707 // int WebSafeBase64Escape() - Google's variation of base64 encoder
1708 //
1709 // Check out
1710 // http://tools.ietf.org/html/rfc2045 for formal description, but what we
1711 // care about is that...
1712 // Take the encoded stuff in groups of 4 characters and turn each
1713 // character into a code 0 to 63 thus:
1714 // A-Z map to 0 to 25
1715 // a-z map to 26 to 51
1716 // 0-9 map to 52 to 61
1717 // +(- for WebSafe) maps to 62
1718 // /(_ for WebSafe) maps to 63
1719 // There will be four numbers, all less than 64 which can be represented
1720 // by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
1721 // Arrange the 6 digit binary numbers into three bytes as such:
1722 // aaaaaabb bbbbcccc ccdddddd
1723 // Equals signs (one or two) are used at the end of the encoded block to
1724 // indicate that the text was not an integer multiple of three bytes long.
1725 // ----------------------------------------------------------------------
1726
Base64UnescapeInternal(const char * src_param,int szsrc,char * dest,int szdest,const signed char * unbase64)1727 int Base64UnescapeInternal(const char *src_param, int szsrc,
1728 char *dest, int szdest,
1729 const signed char* unbase64) {
1730 static const char kPad64Equals = '=';
1731 static const char kPad64Dot = '.';
1732
1733 int decode = 0;
1734 int destidx = 0;
1735 int state = 0;
1736 unsigned int ch = 0;
1737 unsigned int temp = 0;
1738
1739 // If "char" is signed by default, using *src as an array index results in
1740 // accessing negative array elements. Treat the input as a pointer to
1741 // unsigned char to avoid this.
1742 const unsigned char *src = reinterpret_cast<const unsigned char*>(src_param);
1743
1744 // The GET_INPUT macro gets the next input character, skipping
1745 // over any whitespace, and stopping when we reach the end of the
1746 // string or when we read any non-data character. The arguments are
1747 // an arbitrary identifier (used as a label for goto) and the number
1748 // of data bytes that must remain in the input to avoid aborting the
1749 // loop.
1750 #define GET_INPUT(label, remain) \
1751 label: \
1752 --szsrc; \
1753 ch = *src++; \
1754 decode = unbase64[ch]; \
1755 if (decode < 0) { \
1756 if (ascii_isspace(ch) && szsrc >= remain) \
1757 goto label; \
1758 state = 4 - remain; \
1759 break; \
1760 }
1761
1762 // if dest is null, we're just checking to see if it's legal input
1763 // rather than producing output. (I suspect this could just be done
1764 // with a regexp...). We duplicate the loop so this test can be
1765 // outside it instead of in every iteration.
1766
1767 if (dest) {
1768 // This loop consumes 4 input bytes and produces 3 output bytes
1769 // per iteration. We can't know at the start that there is enough
1770 // data left in the string for a full iteration, so the loop may
1771 // break out in the middle; if so 'state' will be set to the
1772 // number of input bytes read.
1773
1774 while (szsrc >= 4) {
1775 // We'll start by optimistically assuming that the next four
1776 // bytes of the string (src[0..3]) are four good data bytes
1777 // (that is, no nulls, whitespace, padding chars, or illegal
1778 // chars). We need to test src[0..2] for nulls individually
1779 // before constructing temp to preserve the property that we
1780 // never read past a null in the string (no matter how long
1781 // szsrc claims the string is).
1782
1783 if (!src[0] || !src[1] || !src[2] ||
1784 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1785 (unsigned(unbase64[src[1]]) << 12) |
1786 (unsigned(unbase64[src[2]]) << 6) |
1787 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1788 // Iff any of those four characters was bad (null, illegal,
1789 // whitespace, padding), then temp's high bit will be set
1790 // (because unbase64[] is -1 for all bad characters).
1791 //
1792 // We'll back up and resort to the slower decoder, which knows
1793 // how to handle those cases.
1794
1795 GET_INPUT(first, 4);
1796 temp = decode;
1797 GET_INPUT(second, 3);
1798 temp = (temp << 6) | decode;
1799 GET_INPUT(third, 2);
1800 temp = (temp << 6) | decode;
1801 GET_INPUT(fourth, 1);
1802 temp = (temp << 6) | decode;
1803 } else {
1804 // We really did have four good data bytes, so advance four
1805 // characters in the string.
1806
1807 szsrc -= 4;
1808 src += 4;
1809 decode = -1;
1810 ch = '\0';
1811 }
1812
1813 // temp has 24 bits of input, so write that out as three bytes.
1814
1815 if (destidx+3 > szdest) return -1;
1816 dest[destidx+2] = temp;
1817 temp >>= 8;
1818 dest[destidx+1] = temp;
1819 temp >>= 8;
1820 dest[destidx] = temp;
1821 destidx += 3;
1822 }
1823 } else {
1824 while (szsrc >= 4) {
1825 if (!src[0] || !src[1] || !src[2] ||
1826 (temp = ((unsigned(unbase64[src[0]]) << 18) |
1827 (unsigned(unbase64[src[1]]) << 12) |
1828 (unsigned(unbase64[src[2]]) << 6) |
1829 (unsigned(unbase64[src[3]])))) & 0x80000000) {
1830 GET_INPUT(first_no_dest, 4);
1831 GET_INPUT(second_no_dest, 3);
1832 GET_INPUT(third_no_dest, 2);
1833 GET_INPUT(fourth_no_dest, 1);
1834 } else {
1835 szsrc -= 4;
1836 src += 4;
1837 decode = -1;
1838 ch = '\0';
1839 }
1840 destidx += 3;
1841 }
1842 }
1843
1844 #undef GET_INPUT
1845
1846 // if the loop terminated because we read a bad character, return
1847 // now.
1848 if (decode < 0 && ch != '\0' &&
1849 ch != kPad64Equals && ch != kPad64Dot && !ascii_isspace(ch))
1850 return -1;
1851
1852 if (ch == kPad64Equals || ch == kPad64Dot) {
1853 // if we stopped by hitting an '=' or '.', un-read that character -- we'll
1854 // look at it again when we count to check for the proper number of
1855 // equals signs at the end.
1856 ++szsrc;
1857 --src;
1858 } else {
1859 // This loop consumes 1 input byte per iteration. It's used to
1860 // clean up the 0-3 input bytes remaining when the first, faster
1861 // loop finishes. 'temp' contains the data from 'state' input
1862 // characters read by the first loop.
1863 while (szsrc > 0) {
1864 --szsrc;
1865 ch = *src++;
1866 decode = unbase64[ch];
1867 if (decode < 0) {
1868 if (ascii_isspace(ch)) {
1869 continue;
1870 } else if (ch == '\0') {
1871 break;
1872 } else if (ch == kPad64Equals || ch == kPad64Dot) {
1873 // back up one character; we'll read it again when we check
1874 // for the correct number of pad characters at the end.
1875 ++szsrc;
1876 --src;
1877 break;
1878 } else {
1879 return -1;
1880 }
1881 }
1882
1883 // Each input character gives us six bits of output.
1884 temp = (temp << 6) | decode;
1885 ++state;
1886 if (state == 4) {
1887 // If we've accumulated 24 bits of output, write that out as
1888 // three bytes.
1889 if (dest) {
1890 if (destidx+3 > szdest) return -1;
1891 dest[destidx+2] = temp;
1892 temp >>= 8;
1893 dest[destidx+1] = temp;
1894 temp >>= 8;
1895 dest[destidx] = temp;
1896 }
1897 destidx += 3;
1898 state = 0;
1899 temp = 0;
1900 }
1901 }
1902 }
1903
1904 // Process the leftover data contained in 'temp' at the end of the input.
1905 int expected_equals = 0;
1906 switch (state) {
1907 case 0:
1908 // Nothing left over; output is a multiple of 3 bytes.
1909 break;
1910
1911 case 1:
1912 // Bad input; we have 6 bits left over.
1913 return -1;
1914
1915 case 2:
1916 // Produce one more output byte from the 12 input bits we have left.
1917 if (dest) {
1918 if (destidx+1 > szdest) return -1;
1919 temp >>= 4;
1920 dest[destidx] = temp;
1921 }
1922 ++destidx;
1923 expected_equals = 2;
1924 break;
1925
1926 case 3:
1927 // Produce two more output bytes from the 18 input bits we have left.
1928 if (dest) {
1929 if (destidx+2 > szdest) return -1;
1930 temp >>= 2;
1931 dest[destidx+1] = temp;
1932 temp >>= 8;
1933 dest[destidx] = temp;
1934 }
1935 destidx += 2;
1936 expected_equals = 1;
1937 break;
1938
1939 default:
1940 // state should have no other values at this point.
1941 GOOGLE_LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1942 }
1943
1944 // The remainder of the string should be all whitespace, mixed with
1945 // exactly 0 equals signs, or exactly 'expected_equals' equals
1946 // signs. (Always accepting 0 equals signs is a google extension
1947 // not covered in the RFC, as is accepting dot as the pad character.)
1948
1949 int equals = 0;
1950 while (szsrc > 0 && *src) {
1951 if (*src == kPad64Equals || *src == kPad64Dot)
1952 ++equals;
1953 else if (!ascii_isspace(*src))
1954 return -1;
1955 --szsrc;
1956 ++src;
1957 }
1958
1959 return (equals == 0 || equals == expected_equals) ? destidx : -1;
1960 }
1961
1962 // The arrays below were generated by the following code
1963 // #include <sys/time.h>
1964 // #include <stdlib.h>
1965 // #include <string.h>
1966 // #include <stdio.h>
1967 // main()
1968 // {
1969 // static const char Base64[] =
1970 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1971 // const char *pos;
1972 // int idx, i, j;
1973 // printf(" ");
1974 // for (i = 0; i < 255; i += 8) {
1975 // for (j = i; j < i + 8; j++) {
1976 // pos = strchr(Base64, j);
1977 // if ((pos == nullptr) || (j == 0))
1978 // idx = -1;
1979 // else
1980 // idx = pos - Base64;
1981 // if (idx == -1)
1982 // printf(" %2d, ", idx);
1983 // else
1984 // printf(" %2d/""*%c*""/,", idx, j);
1985 // }
1986 // printf("\n ");
1987 // }
1988 // }
1989 //
1990 // where the value of "Base64[]" was replaced by one of the base-64 conversion
1991 // tables from the functions below.
1992 static const signed char kUnBase64[] = {
1993 -1, -1, -1, -1, -1, -1, -1, -1,
1994 -1, -1, -1, -1, -1, -1, -1, -1,
1995 -1, -1, -1, -1, -1, -1, -1, -1,
1996 -1, -1, -1, -1, -1, -1, -1, -1,
1997 -1, -1, -1, -1, -1, -1, -1, -1,
1998 -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
1999 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2000 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2001 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2002 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2003 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2004 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
2005 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2006 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2007 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2008 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2009 -1, -1, -1, -1, -1, -1, -1, -1,
2010 -1, -1, -1, -1, -1, -1, -1, -1,
2011 -1, -1, -1, -1, -1, -1, -1, -1,
2012 -1, -1, -1, -1, -1, -1, -1, -1,
2013 -1, -1, -1, -1, -1, -1, -1, -1,
2014 -1, -1, -1, -1, -1, -1, -1, -1,
2015 -1, -1, -1, -1, -1, -1, -1, -1,
2016 -1, -1, -1, -1, -1, -1, -1, -1,
2017 -1, -1, -1, -1, -1, -1, -1, -1,
2018 -1, -1, -1, -1, -1, -1, -1, -1,
2019 -1, -1, -1, -1, -1, -1, -1, -1,
2020 -1, -1, -1, -1, -1, -1, -1, -1,
2021 -1, -1, -1, -1, -1, -1, -1, -1,
2022 -1, -1, -1, -1, -1, -1, -1, -1,
2023 -1, -1, -1, -1, -1, -1, -1, -1,
2024 -1, -1, -1, -1, -1, -1, -1, -1
2025 };
2026 static const signed char kUnWebSafeBase64[] = {
2027 -1, -1, -1, -1, -1, -1, -1, -1,
2028 -1, -1, -1, -1, -1, -1, -1, -1,
2029 -1, -1, -1, -1, -1, -1, -1, -1,
2030 -1, -1, -1, -1, -1, -1, -1, -1,
2031 -1, -1, -1, -1, -1, -1, -1, -1,
2032 -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
2033 52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
2034 60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
2035 -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
2036 7/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
2037 15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
2038 23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
2039 -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
2040 33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
2041 41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
2042 49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
2043 -1, -1, -1, -1, -1, -1, -1, -1,
2044 -1, -1, -1, -1, -1, -1, -1, -1,
2045 -1, -1, -1, -1, -1, -1, -1, -1,
2046 -1, -1, -1, -1, -1, -1, -1, -1,
2047 -1, -1, -1, -1, -1, -1, -1, -1,
2048 -1, -1, -1, -1, -1, -1, -1, -1,
2049 -1, -1, -1, -1, -1, -1, -1, -1,
2050 -1, -1, -1, -1, -1, -1, -1, -1,
2051 -1, -1, -1, -1, -1, -1, -1, -1,
2052 -1, -1, -1, -1, -1, -1, -1, -1,
2053 -1, -1, -1, -1, -1, -1, -1, -1,
2054 -1, -1, -1, -1, -1, -1, -1, -1,
2055 -1, -1, -1, -1, -1, -1, -1, -1,
2056 -1, -1, -1, -1, -1, -1, -1, -1,
2057 -1, -1, -1, -1, -1, -1, -1, -1,
2058 -1, -1, -1, -1, -1, -1, -1, -1
2059 };
2060
WebSafeBase64Unescape(const char * src,int szsrc,char * dest,int szdest)2061 int WebSafeBase64Unescape(const char *src, int szsrc, char *dest, int szdest) {
2062 return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
2063 }
2064
Base64UnescapeInternal(const char * src,int slen,string * dest,const signed char * unbase64)2065 static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
2066 const signed char* unbase64) {
2067 // Determine the size of the output string. Base64 encodes every 3 bytes into
2068 // 4 characters. any leftover chars are added directly for good measure.
2069 // This is documented in the base64 RFC: http://tools.ietf.org/html/rfc3548
2070 const int dest_len = 3 * (slen / 4) + (slen % 4);
2071
2072 dest->resize(dest_len);
2073
2074 // We are getting the destination buffer by getting the beginning of the
2075 // string and converting it into a char *.
2076 const int len = Base64UnescapeInternal(src, slen, string_as_array(dest),
2077 dest_len, unbase64);
2078 if (len < 0) {
2079 dest->clear();
2080 return false;
2081 }
2082
2083 // could be shorter if there was padding
2084 GOOGLE_DCHECK_LE(len, dest_len);
2085 dest->erase(len);
2086
2087 return true;
2088 }
2089
Base64Unescape(StringPiece src,string * dest)2090 bool Base64Unescape(StringPiece src, string* dest) {
2091 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
2092 }
2093
WebSafeBase64Unescape(StringPiece src,string * dest)2094 bool WebSafeBase64Unescape(StringPiece src, string* dest) {
2095 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
2096 }
2097
Base64EscapeInternal(const unsigned char * src,int szsrc,char * dest,int szdest,const char * base64,bool do_padding)2098 int Base64EscapeInternal(const unsigned char *src, int szsrc,
2099 char *dest, int szdest, const char *base64,
2100 bool do_padding) {
2101 static const char kPad64 = '=';
2102
2103 if (szsrc <= 0) return 0;
2104
2105 if (szsrc * 4 > szdest * 3) return 0;
2106
2107 char *cur_dest = dest;
2108 const unsigned char *cur_src = src;
2109
2110 char *limit_dest = dest + szdest;
2111 const unsigned char *limit_src = src + szsrc;
2112
2113 // Three bytes of data encodes to four characters of cyphertext.
2114 // So we can pump through three-byte chunks atomically.
2115 while (cur_src < limit_src - 3) { // keep going as long as we have >= 32 bits
2116 uint32 in = BigEndian::Load32(cur_src) >> 8;
2117
2118 cur_dest[0] = base64[in >> 18];
2119 in &= 0x3FFFF;
2120 cur_dest[1] = base64[in >> 12];
2121 in &= 0xFFF;
2122 cur_dest[2] = base64[in >> 6];
2123 in &= 0x3F;
2124 cur_dest[3] = base64[in];
2125
2126 cur_dest += 4;
2127 cur_src += 3;
2128 }
2129 // To save time, we didn't update szdest or szsrc in the loop. So do it now.
2130 szdest = limit_dest - cur_dest;
2131 szsrc = limit_src - cur_src;
2132
2133 /* now deal with the tail (<=3 bytes) */
2134 switch (szsrc) {
2135 case 0:
2136 // Nothing left; nothing more to do.
2137 break;
2138 case 1: {
2139 // One byte left: this encodes to two characters, and (optionally)
2140 // two pad characters to round out the four-character cypherblock.
2141 if ((szdest -= 2) < 0) return 0;
2142 uint32 in = cur_src[0];
2143 cur_dest[0] = base64[in >> 2];
2144 in &= 0x3;
2145 cur_dest[1] = base64[in << 4];
2146 cur_dest += 2;
2147 if (do_padding) {
2148 if ((szdest -= 2) < 0) return 0;
2149 cur_dest[0] = kPad64;
2150 cur_dest[1] = kPad64;
2151 cur_dest += 2;
2152 }
2153 break;
2154 }
2155 case 2: {
2156 // Two bytes left: this encodes to three characters, and (optionally)
2157 // one pad character to round out the four-character cypherblock.
2158 if ((szdest -= 3) < 0) return 0;
2159 uint32 in = BigEndian::Load16(cur_src);
2160 cur_dest[0] = base64[in >> 10];
2161 in &= 0x3FF;
2162 cur_dest[1] = base64[in >> 4];
2163 in &= 0x00F;
2164 cur_dest[2] = base64[in << 2];
2165 cur_dest += 3;
2166 if (do_padding) {
2167 if ((szdest -= 1) < 0) return 0;
2168 cur_dest[0] = kPad64;
2169 cur_dest += 1;
2170 }
2171 break;
2172 }
2173 case 3: {
2174 // Three bytes left: same as in the big loop above. We can't do this in
2175 // the loop because the loop above always reads 4 bytes, and the fourth
2176 // byte is past the end of the input.
2177 if ((szdest -= 4) < 0) return 0;
2178 uint32 in = (cur_src[0] << 16) + BigEndian::Load16(cur_src + 1);
2179 cur_dest[0] = base64[in >> 18];
2180 in &= 0x3FFFF;
2181 cur_dest[1] = base64[in >> 12];
2182 in &= 0xFFF;
2183 cur_dest[2] = base64[in >> 6];
2184 in &= 0x3F;
2185 cur_dest[3] = base64[in];
2186 cur_dest += 4;
2187 break;
2188 }
2189 default:
2190 // Should not be reached: blocks of 4 bytes are handled
2191 // in the while loop before this switch statement.
2192 GOOGLE_LOG(FATAL) << "Logic problem? szsrc = " << szsrc;
2193 break;
2194 }
2195 return (cur_dest - dest);
2196 }
2197
2198 static const char kBase64Chars[] =
2199 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2200
2201 static const char kWebSafeBase64Chars[] =
2202 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_";
2203
Base64Escape(const unsigned char * src,int szsrc,char * dest,int szdest)2204 int Base64Escape(const unsigned char *src, int szsrc, char *dest, int szdest) {
2205 return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
2206 }
WebSafeBase64Escape(const unsigned char * src,int szsrc,char * dest,int szdest,bool do_padding)2207 int WebSafeBase64Escape(const unsigned char *src, int szsrc, char *dest,
2208 int szdest, bool do_padding) {
2209 return Base64EscapeInternal(src, szsrc, dest, szdest,
2210 kWebSafeBase64Chars, do_padding);
2211 }
2212
Base64EscapeInternal(const unsigned char * src,int szsrc,string * dest,bool do_padding,const char * base64_chars)2213 void Base64EscapeInternal(const unsigned char* src, int szsrc,
2214 string* dest, bool do_padding,
2215 const char* base64_chars) {
2216 const int calc_escaped_size =
2217 CalculateBase64EscapedLen(szsrc, do_padding);
2218 dest->resize(calc_escaped_size);
2219 const int escaped_len = Base64EscapeInternal(src, szsrc,
2220 string_as_array(dest),
2221 dest->size(),
2222 base64_chars,
2223 do_padding);
2224 GOOGLE_DCHECK_EQ(calc_escaped_size, escaped_len);
2225 dest->erase(escaped_len);
2226 }
2227
Base64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2228 void Base64Escape(const unsigned char *src, int szsrc,
2229 string* dest, bool do_padding) {
2230 Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
2231 }
2232
WebSafeBase64Escape(const unsigned char * src,int szsrc,string * dest,bool do_padding)2233 void WebSafeBase64Escape(const unsigned char *src, int szsrc,
2234 string *dest, bool do_padding) {
2235 Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
2236 }
2237
Base64Escape(StringPiece src,string * dest)2238 void Base64Escape(StringPiece src, string* dest) {
2239 Base64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2240 src.size(), dest, true);
2241 }
2242
WebSafeBase64Escape(StringPiece src,string * dest)2243 void WebSafeBase64Escape(StringPiece src, string* dest) {
2244 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2245 src.size(), dest, false);
2246 }
2247
WebSafeBase64EscapeWithPadding(StringPiece src,string * dest)2248 void WebSafeBase64EscapeWithPadding(StringPiece src, string* dest) {
2249 WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()),
2250 src.size(), dest, true);
2251 }
2252
2253 // Helper to append a Unicode code point to a string as UTF8, without bringing
2254 // in any external dependencies.
EncodeAsUTF8Char(uint32 code_point,char * output)2255 int EncodeAsUTF8Char(uint32 code_point, char* output) {
2256 uint32 tmp = 0;
2257 int len = 0;
2258 if (code_point <= 0x7f) {
2259 tmp = code_point;
2260 len = 1;
2261 } else if (code_point <= 0x07ff) {
2262 tmp = 0x0000c080 |
2263 ((code_point & 0x07c0) << 2) |
2264 (code_point & 0x003f);
2265 len = 2;
2266 } else if (code_point <= 0xffff) {
2267 tmp = 0x00e08080 |
2268 ((code_point & 0xf000) << 4) |
2269 ((code_point & 0x0fc0) << 2) |
2270 (code_point & 0x003f);
2271 len = 3;
2272 } else {
2273 // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
2274 // normally only defined up to there as well.
2275 tmp = 0xf0808080 |
2276 ((code_point & 0x1c0000) << 6) |
2277 ((code_point & 0x03f000) << 4) |
2278 ((code_point & 0x000fc0) << 2) |
2279 (code_point & 0x003f);
2280 len = 4;
2281 }
2282 tmp = ghtonl(tmp);
2283 memcpy(output, reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
2284 return len;
2285 }
2286
2287 // Table of UTF-8 character lengths, based on first byte
2288 static const unsigned char kUTF8LenTbl[256] = {
2289 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2290 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2291 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2292 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2293
2294 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2295 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
2296 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
2297 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4
2298 };
2299
2300 // Return length of a single UTF-8 source character
UTF8FirstLetterNumBytes(const char * src,int len)2301 int UTF8FirstLetterNumBytes(const char* src, int len) {
2302 if (len == 0) {
2303 return 0;
2304 }
2305 return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)];
2306 }
2307
2308 // ----------------------------------------------------------------------
2309 // CleanStringLineEndings()
2310 // Clean up a multi-line string to conform to Unix line endings.
2311 // Reads from src and appends to dst, so usually dst should be empty.
2312 //
2313 // If there is no line ending at the end of a non-empty string, it can
2314 // be added automatically.
2315 //
2316 // Four different types of input are correctly handled:
2317 //
2318 // - Unix/Linux files: line ending is LF: pass through unchanged
2319 //
2320 // - DOS/Windows files: line ending is CRLF: convert to LF
2321 //
2322 // - Legacy Mac files: line ending is CR: convert to LF
2323 //
2324 // - Garbled files: random line endings: convert gracefully
2325 // lonely CR, lonely LF, CRLF: convert to LF
2326 //
2327 // @param src The multi-line string to convert
2328 // @param dst The converted string is appended to this string
2329 // @param auto_end_last_line Automatically terminate the last line
2330 //
2331 // Limitations:
2332 //
2333 // This does not do the right thing for CRCRLF files created by
2334 // broken programs that do another Unix->DOS conversion on files
2335 // that are already in CRLF format. For this, a two-pass approach
2336 // brute-force would be needed that
2337 //
2338 // (1) determines the presence of LF (first one is ok)
2339 // (2) if yes, removes any CR, else convert every CR to LF
2340
CleanStringLineEndings(const string & src,string * dst,bool auto_end_last_line)2341 void CleanStringLineEndings(const string &src, string *dst,
2342 bool auto_end_last_line) {
2343 if (dst->empty()) {
2344 dst->append(src);
2345 CleanStringLineEndings(dst, auto_end_last_line);
2346 } else {
2347 string tmp = src;
2348 CleanStringLineEndings(&tmp, auto_end_last_line);
2349 dst->append(tmp);
2350 }
2351 }
2352
CleanStringLineEndings(string * str,bool auto_end_last_line)2353 void CleanStringLineEndings(string *str, bool auto_end_last_line) {
2354 ptrdiff_t output_pos = 0;
2355 bool r_seen = false;
2356 ptrdiff_t len = str->size();
2357
2358 char *p = &(*str)[0];
2359
2360 for (ptrdiff_t input_pos = 0; input_pos < len;) {
2361 if (!r_seen && input_pos + 8 < len) {
2362 uint64_t v = GOOGLE_UNALIGNED_LOAD64(p + input_pos);
2363 // Loop over groups of 8 bytes at a time until we come across
2364 // a word that has a byte whose value is less than or equal to
2365 // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
2366 //
2367 // We use a has_less macro that quickly tests a whole 64-bit
2368 // word to see if any of the bytes has a value < N.
2369 //
2370 // For more details, see:
2371 // http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
2372 #define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
2373 if (!has_less(v, '\r' + 1)) {
2374 #undef has_less
2375 // No byte in this word has a value that could be a \r or a \n
2376 if (output_pos != input_pos) {
2377 GOOGLE_UNALIGNED_STORE64(p + output_pos, v);
2378 }
2379 input_pos += 8;
2380 output_pos += 8;
2381 continue;
2382 }
2383 }
2384 string::const_reference in = p[input_pos];
2385 if (in == '\r') {
2386 if (r_seen) p[output_pos++] = '\n';
2387 r_seen = true;
2388 } else if (in == '\n') {
2389 if (input_pos != output_pos)
2390 p[output_pos++] = '\n';
2391 else
2392 output_pos++;
2393 r_seen = false;
2394 } else {
2395 if (r_seen) p[output_pos++] = '\n';
2396 r_seen = false;
2397 if (input_pos != output_pos)
2398 p[output_pos++] = in;
2399 else
2400 output_pos++;
2401 }
2402 input_pos++;
2403 }
2404 if (r_seen ||
2405 (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
2406 str->resize(output_pos + 1);
2407 str->operator[](output_pos) = '\n';
2408 } else if (output_pos < len) {
2409 str->resize(output_pos);
2410 }
2411 }
2412
2413 } // namespace protobuf
2414 } // namespace google
2415