• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 #include <limits>
23 #include <string>
24 
25 #include "absl/base/config.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/ascii.h"
29 #include "absl/strings/charset.h"
30 #include "absl/strings/internal/escaping.h"
31 #include "absl/strings/internal/resize_uninitialized.h"
32 #include "absl/strings/internal/utf8.h"
33 #include "absl/strings/numbers.h"
34 #include "absl/strings/str_cat.h"
35 #include "absl/strings/string_view.h"
36 
37 namespace absl {
38 ABSL_NAMESPACE_BEGIN
39 namespace {
40 
// Value passed as the 'leave_nulls_escaped' argument of CUnescapeInternal().
// When that argument is false, escape sequences that denote the NUL character
// (for example "\0" or "\x00") are decoded to a real NUL byte instead of being
// copied to the output in their escaped form.
constexpr bool kUnescapeNulls = false;
43 
// Returns true iff 'c' is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  // After subtracting '0' the eight octal digits map onto 0..7. Every other
  // character maps either to a negative value (which becomes a huge unsigned
  // number) or to 8 and above, so a single unsigned comparison suffices.
  return static_cast<unsigned int>(c - '0') < 8u;
}
45 
hex_digit_to_int(char c)46 inline unsigned int hex_digit_to_int(char c) {
47   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
48                 "Character set must be ASCII.");
49   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
50   unsigned int x = static_cast<unsigned char>(c);
51   if (x > '9') {
52     x += 9;
53   }
54   return x & 0xf;
55 }
56 
IsSurrogate(char32_t c,absl::string_view src,std::string * error)57 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
58   if (c >= 0xD800 && c <= 0xDFFF) {
59     if (error) {
60       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
61                             src);
62     }
63     return true;
64   }
65   return false;
66 }
67 
68 // ----------------------------------------------------------------------
69 // CUnescapeInternal()
70 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
71 //
72 //    Unescapes C escape sequences and is the reverse of CEscape().
73 //
74 //    If 'source' is valid, stores the unescaped string and its size in
75 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
76 //    returns false and optionally stores the error description in
77 //    'error'. Set 'error' to nullptr to disable error reporting.
78 //
79 //    'dest' should point to a buffer that is at least as big as 'source'.
80 //    'source' and 'dest' may be the same.
81 //
82 //     NOTE: any changes to this function must also be reflected in the older
83 //     UnescapeCEscapeSequences().
84 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)85 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
86                        char* dest, ptrdiff_t* dest_len, std::string* error) {
87   char* d = dest;
88   const char* p = source.data();
89   const char* end = p + source.size();
90   const char* last_byte = end - 1;
91 
92   // Small optimization for case where source = dest and there's no escaping
93   while (p == d && p < end && *p != '\\') p++, d++;
94 
95   while (p < end) {
96     if (*p != '\\') {
97       *d++ = *p++;
98     } else {
99       if (++p > last_byte) {  // skip past the '\\'
100         if (error) *error = "String cannot end with \\";
101         return false;
102       }
103       switch (*p) {
104         case 'a':  *d++ = '\a';  break;
105         case 'b':  *d++ = '\b';  break;
106         case 'f':  *d++ = '\f';  break;
107         case 'n':  *d++ = '\n';  break;
108         case 'r':  *d++ = '\r';  break;
109         case 't':  *d++ = '\t';  break;
110         case 'v':  *d++ = '\v';  break;
111         case '\\': *d++ = '\\';  break;
112         case '?':  *d++ = '\?';  break;    // \?  Who knew?
113         case '\'': *d++ = '\'';  break;
114         case '"':  *d++ = '\"';  break;
115         case '0':
116         case '1':
117         case '2':
118         case '3':
119         case '4':
120         case '5':
121         case '6':
122         case '7': {
123           // octal digit: 1 to 3 digits
124           const char* octal_start = p;
125           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
126           if (p < last_byte && is_octal_digit(p[1]))
127             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
128           if (p < last_byte && is_octal_digit(p[1]))
129             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
130           if (ch > 0xff) {
131             if (error) {
132               *error = "Value of \\" +
133                        std::string(octal_start,
134                                    static_cast<size_t>(p + 1 - octal_start)) +
135                        " exceeds 0xff";
136             }
137             return false;
138           }
139           if ((ch == 0) && leave_nulls_escaped) {
140             // Copy the escape sequence for the null character
141             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
142             *d++ = '\\';
143             memmove(d, octal_start, octal_size);
144             d += octal_size;
145             break;
146           }
147           *d++ = static_cast<char>(ch);
148           break;
149         }
150         case 'x':
151         case 'X': {
152           if (p >= last_byte) {
153             if (error) *error = "String cannot end with \\x";
154             return false;
155           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
156             if (error) *error = "\\x cannot be followed by a non-hex digit";
157             return false;
158           }
159           unsigned int ch = 0;
160           const char* hex_start = p;
161           while (p < last_byte &&
162                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
163             // Arbitrarily many hex digits
164             ch = (ch << 4) + hex_digit_to_int(*++p);
165           if (ch > 0xFF) {
166             if (error) {
167               *error = "Value of \\" +
168                        std::string(hex_start,
169                                    static_cast<size_t>(p + 1 - hex_start)) +
170                        " exceeds 0xff";
171             }
172             return false;
173           }
174           if ((ch == 0) && leave_nulls_escaped) {
175             // Copy the escape sequence for the null character
176             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
177             *d++ = '\\';
178             memmove(d, hex_start, hex_size);
179             d += hex_size;
180             break;
181           }
182           *d++ = static_cast<char>(ch);
183           break;
184         }
185         case 'u': {
186           // \uhhhh => convert 4 hex digits to UTF-8
187           char32_t rune = 0;
188           const char* hex_start = p;
189           if (p + 4 >= end) {
190             if (error) {
191               *error = "\\u must be followed by 4 hex digits: \\" +
192                        std::string(hex_start,
193                                    static_cast<size_t>(p + 1 - hex_start));
194             }
195             return false;
196           }
197           for (int i = 0; i < 4; ++i) {
198             // Look one char ahead.
199             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
200               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
201             } else {
202               if (error) {
203                 *error = "\\u must be followed by 4 hex digits: \\" +
204                          std::string(hex_start,
205                                      static_cast<size_t>(p + 1 - hex_start));
206               }
207               return false;
208             }
209           }
210           if ((rune == 0) && leave_nulls_escaped) {
211             // Copy the escape sequence for the null character
212             *d++ = '\\';
213             memmove(d, hex_start, 5);  // u0000
214             d += 5;
215             break;
216           }
217           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
218             return false;
219           }
220           d += strings_internal::EncodeUTF8Char(d, rune);
221           break;
222         }
223         case 'U': {
224           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
225           char32_t rune = 0;
226           const char* hex_start = p;
227           if (p + 8 >= end) {
228             if (error) {
229               *error = "\\U must be followed by 8 hex digits: \\" +
230                        std::string(hex_start,
231                                    static_cast<size_t>(p + 1 - hex_start));
232             }
233             return false;
234           }
235           for (int i = 0; i < 8; ++i) {
236             // Look one char ahead.
237             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
238               // Don't change rune until we're sure this
239               // is within the Unicode limit, but do advance p.
240               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
241               if (newrune > 0x10FFFF) {
242                 if (error) {
243                   *error = "Value of \\" +
244                            std::string(hex_start,
245                                        static_cast<size_t>(p + 1 - hex_start)) +
246                            " exceeds Unicode limit (0x10FFFF)";
247                 }
248                 return false;
249               } else {
250                 rune = newrune;
251               }
252             } else {
253               if (error) {
254                 *error = "\\U must be followed by 8 hex digits: \\" +
255                          std::string(hex_start,
256                                      static_cast<size_t>(p + 1 - hex_start));
257               }
258               return false;
259             }
260           }
261           if ((rune == 0) && leave_nulls_escaped) {
262             // Copy the escape sequence for the null character
263             *d++ = '\\';
264             memmove(d, hex_start, 9);  // U00000000
265             d += 9;
266             break;
267           }
268           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
269             return false;
270           }
271           d += strings_internal::EncodeUTF8Char(d, rune);
272           break;
273         }
274         default: {
275           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
276           return false;
277         }
278       }
279       p++;                                 // read past letter we escaped
280     }
281   }
282   *dest_len = d - dest;
283   return true;
284 }
285 
286 // ----------------------------------------------------------------------
287 // CUnescapeInternal()
288 //
289 //    Same as above but uses a std::string for output. 'source' and 'dest'
290 //    may be the same.
291 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)292 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
293                        std::string* dest, std::string* error) {
294   strings_internal::STLStringResizeUninitialized(dest, source.size());
295 
296   ptrdiff_t dest_size;
297   if (!CUnescapeInternal(source,
298                          leave_nulls_escaped,
299                          &(*dest)[0],
300                          &dest_size,
301                          error)) {
302     return false;
303   }
304   dest->erase(static_cast<size_t>(dest_size));
305   return true;
306 }
307 
308 // ----------------------------------------------------------------------
309 // CEscape()
310 // CHexEscape()
311 // Utf8SafeCEscape()
312 // Utf8SafeCHexEscape()
313 //    Escapes 'src' using C-style escape sequences.  This is useful for
314 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
315 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
316 //
317 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
318 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)319 std::string CEscapeInternal(absl::string_view src, bool use_hex,
320                             bool utf8_safe) {
321   std::string dest;
322   bool last_hex_escape = false;  // true if last output char was \xNN.
323 
324   for (char c : src) {
325     bool is_hex_escape = false;
326     switch (c) {
327       case '\n': dest.append("\\" "n"); break;
328       case '\r': dest.append("\\" "r"); break;
329       case '\t': dest.append("\\" "t"); break;
330       case '\"': dest.append("\\" "\""); break;
331       case '\'': dest.append("\\" "'"); break;
332       case '\\': dest.append("\\" "\\"); break;
333       default: {
334         // Note that if we emit \xNN and the src character after that is a hex
335         // digit then that digit must be escaped too to prevent it being
336         // interpreted as part of the character code by C.
337         const unsigned char uc = static_cast<unsigned char>(c);
338         if ((!utf8_safe || uc < 0x80) &&
339             (!absl::ascii_isprint(uc) ||
340              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
341           if (use_hex) {
342             dest.append("\\" "x");
343             dest.push_back(numbers_internal::kHexChar[uc / 16]);
344             dest.push_back(numbers_internal::kHexChar[uc % 16]);
345             is_hex_escape = true;
346           } else {
347             dest.append("\\");
348             dest.push_back(numbers_internal::kHexChar[uc / 64]);
349             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
350             dest.push_back(numbers_internal::kHexChar[uc % 8]);
351           }
352         } else {
353           dest.push_back(c);
354           break;
355         }
356       }
357     }
358     last_hex_escape = is_hex_escape;
359   }
360 
361   return dest;
362 }
363 
/* clang-format off */
// c_escaped_len[b] is the number of output characters used for byte value
// 'b' in this table's scheme: 1 when the byte is copied as is, 2 when it has
// a two-character escape (\t, \n, \r, \", \', \\), and 4 when it is written
// as a backslash followed by three octal digits. Each row covers 16
// consecutive byte values; the trailing comment gives the row's first value.
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // 0x00: \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x10
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x20: ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x30: '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x40: 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 0x50: 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 0x60: 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 0x70: 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x80
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0x90
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xA0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xB0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xC0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xD0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xE0
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,  // 0xF0
};
/* clang-format on */
384 
385 // Calculates the length of the C-style escaped version of 'src'.
386 // Assumes that non-printable characters are escaped using octal sequences, and
387 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)388 inline size_t CEscapedLength(absl::string_view src) {
389   size_t escaped_len = 0;
390   for (char c : src)
391     escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
392   return escaped_len;
393 }
394 
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)395 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
396   size_t escaped_len = CEscapedLength(src);
397   if (escaped_len == src.size()) {
398     dest->append(src.data(), src.size());
399     return;
400   }
401 
402   size_t cur_dest_len = dest->size();
403   strings_internal::STLStringResizeUninitialized(dest,
404                                                  cur_dest_len + escaped_len);
405   char* append_ptr = &(*dest)[cur_dest_len];
406 
407   for (char c : src) {
408     size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
409     if (char_len == 1) {
410       *append_ptr++ = c;
411     } else if (char_len == 2) {
412       switch (c) {
413         case '\n':
414           *append_ptr++ = '\\';
415           *append_ptr++ = 'n';
416           break;
417         case '\r':
418           *append_ptr++ = '\\';
419           *append_ptr++ = 'r';
420           break;
421         case '\t':
422           *append_ptr++ = '\\';
423           *append_ptr++ = 't';
424           break;
425         case '\"':
426           *append_ptr++ = '\\';
427           *append_ptr++ = '\"';
428           break;
429         case '\'':
430           *append_ptr++ = '\\';
431           *append_ptr++ = '\'';
432           break;
433         case '\\':
434           *append_ptr++ = '\\';
435           *append_ptr++ = '\\';
436           break;
437       }
438     } else {
439       *append_ptr++ = '\\';
440       *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
441       *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
442       *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
443     }
444   }
445 }
446 
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Decodes the 'szsrc' bytes starting at 'src_param'.
//
//   dest, szdest  Output buffer and its capacity in bytes. 'dest' may be
//                 null, in which case the input is only validated and the
//                 decoded length is computed without writing anything.
//   unbase64      256-entry table indexed by input byte. It holds the 6-bit
//                 value of each data character and a negative value for
//                 every other byte.
//   len           Receives the number of decoded bytes. It is written only
//                 when the function returns true.
//
// ASCII whitespace in the input is skipped. Trailing padding may use '=' or
// '.', and must consist of either no pad characters at all or exactly the
// number implied by the amount of leftover data.
//
// Returns false if the input contains an illegal character, if a non-null
// 'dest' is too small for the decoded data, if a single data character is
// left over at the end, or if the padding count is wrong.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced (or counted)
  int decode = 0;         // table value of the most recently read character
  int state = 0;          // data characters pending in 'temp' (0..3)
  unsigned char ch = 0;   // most recently read input character
  unsigned int temp = 0;  // accumulator for up to 24 bits of decoded data

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only variant of the loop above: same input handling, but
    // only the output length is tracked.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
681 
682 // The arrays below map base64-escaped characters back to their original values.
683 // For the inverse case, see k(WebSafe)Base64Chars in the internal
684 // escaping.cc.
685 // These arrays were generated by the following inversion code:
686 // #include <sys/time.h>
687 // #include <stdlib.h>
688 // #include <string.h>
689 // main()
690 // {
691 //   static const char Base64[] =
692 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
693 //   char* pos;
694 //   int idx, i, j;
695 //   printf("    ");
696 //   for (i = 0; i < 255; i += 8) {
697 //     for (j = i; j < i + 8; j++) {
698 //       pos = strchr(Base64, j);
699 //       if ((pos == nullptr) || (j == 0))
700 //         idx = -1;
701 //       else
702 //         idx = pos - Base64;
703 //       if (idx == -1)
704 //         printf(" %2d,     ", idx);
705 //       else
706 //         printf(" %2d/*%c*/,", idx, j);
707 //     }
708 //     printf("\n    ");
709 //   }
710 // }
711 //
712 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
713 // in the internal escaping.cc.
714 /* clang-format off */
// Decoding table for the standard base64 alphabet (A-Z, a-z, 0-9, '+', '/').
// Indexed by input byte: holds the 6-bit value of each alphabet character and
// -1 for every other byte. Each row covers 8 consecutive byte values.
constexpr signed char kUnBase64[] = {
    // 0x00 - 0x27: no alphabet characters.
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    // 0x28 - 0x3F: '+', '/' and the digits.
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    // 0x40 - 0x5F: upper-case letters.
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
     7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    // 0x60 - 0x7F: lower-case letters.
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    // 0x80 - 0xFF: no alphabet characters.
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
749 
// Decoding table for the web-safe base64 alphabet (A-Z, a-z, 0-9, '-', '_').
// Indexed by input byte: holds the 6-bit value of each alphabet character and
// -1 for every other byte. Each row covers 8 consecutive byte values.
constexpr signed char kUnWebSafeBase64[] = {
    // 0x00 - 0x27: no alphabet characters.
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    // 0x28 - 0x3F: '-' and the digits.
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    // 0x40 - 0x5F: upper-case letters and '_'.
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
     7/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    // 0x60 - 0x7F: lower-case letters.
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    // 0x80 - 0xFF: no alphabet characters.
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
784 /* clang-format on */
785 
786 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)787 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
788                             const signed char* unbase64) {
789   // Determine the size of the output string.  Base64 encodes every 3 bytes into
790   // 4 characters.  Any leftover chars are added directly for good measure.
791   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
792 
793   strings_internal::STLStringResizeUninitialized(dest, dest_len);
794 
795   // We are getting the destination buffer by getting the beginning of the
796   // string and converting it into a char *.
797   size_t len;
798   const bool ok =
799       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
800   if (!ok) {
801     dest->clear();
802     return false;
803   }
804 
805   // could be shorter if there was padding
806   assert(len <= dest_len);
807   dest->erase(len);
808 
809   return true;
810 }
811 
812 /* clang-format off */
// Maps a byte value to the numeric value of the hex digit it represents.
// Bytes that are not in '0'-'9', 'A'-'F' or 'a'-'f' map to 0 rather than to an
// error marker, which is what makes this table "lenient".
constexpr char kHexValueLenient[256] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30: '0'..'9' */
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 0, 0, 0, 0, 0,
    /* 0x40: 'A'..'F' */
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60: 'a'..'f' */
    0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
831 
832 /* clang-format on */
833 
834 // This is a templated function so that T can be either a char*
835 // or a string.  This works because we use the [] operator to access
836 // individual characters at a time.
837 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)838 void HexStringToBytesInternal(const char* from, T to, size_t num) {
839   for (size_t i = 0; i < num; i++) {
840     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
841             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
842   }
843 }
844 
845 // This is a templated function so that T can be either a char* or a
846 // std::string.
847 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)848 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
849   auto dest_ptr = &dest[0];
850   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
851     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
852     std::copy(hex_p, hex_p + 2, dest_ptr);
853   }
854 }
855 
856 }  // namespace
857 
858 // ----------------------------------------------------------------------
859 // CUnescape()
860 //
861 // See CUnescapeInternal() for implementation details.
862 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)863 bool CUnescape(absl::string_view source, std::string* dest,
864                std::string* error) {
865   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
866 }
867 
CEscape(absl::string_view src)868 std::string CEscape(absl::string_view src) {
869   std::string dest;
870   CEscapeAndAppendInternal(src, &dest);
871   return dest;
872 }
873 
CHexEscape(absl::string_view src)874 std::string CHexEscape(absl::string_view src) {
875   return CEscapeInternal(src, true, false);
876 }
877 
Utf8SafeCEscape(absl::string_view src)878 std::string Utf8SafeCEscape(absl::string_view src) {
879   return CEscapeInternal(src, false, true);
880 }
881 
Utf8SafeCHexEscape(absl::string_view src)882 std::string Utf8SafeCHexEscape(absl::string_view src) {
883   return CEscapeInternal(src, true, true);
884 }
885 
Base64Unescape(absl::string_view src,std::string * dest)886 bool Base64Unescape(absl::string_view src, std::string* dest) {
887   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
888 }
889 
WebSafeBase64Unescape(absl::string_view src,std::string * dest)890 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
891   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
892 }
893 
Base64Escape(absl::string_view src,std::string * dest)894 void Base64Escape(absl::string_view src, std::string* dest) {
895   strings_internal::Base64EscapeInternal(
896       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
897       true, strings_internal::kBase64Chars);
898 }
899 
WebSafeBase64Escape(absl::string_view src,std::string * dest)900 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
901   strings_internal::Base64EscapeInternal(
902       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
903       false, strings_internal::kWebSafeBase64Chars);
904 }
905 
Base64Escape(absl::string_view src)906 std::string Base64Escape(absl::string_view src) {
907   std::string dest;
908   strings_internal::Base64EscapeInternal(
909       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
910       true, strings_internal::kBase64Chars);
911   return dest;
912 }
913 
WebSafeBase64Escape(absl::string_view src)914 std::string WebSafeBase64Escape(absl::string_view src) {
915   std::string dest;
916   strings_internal::Base64EscapeInternal(
917       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
918       false, strings_internal::kWebSafeBase64Chars);
919   return dest;
920 }
921 
HexStringToBytes(absl::string_view from)922 std::string HexStringToBytes(absl::string_view from) {
923   std::string result;
924   const auto num = from.size() / 2;
925   strings_internal::STLStringResizeUninitialized(&result, num);
926   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
927   return result;
928 }
929 
BytesToHexString(absl::string_view from)930 std::string BytesToHexString(absl::string_view from) {
931   std::string result;
932   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
933   absl::BytesToHexStringInternal<std::string&>(
934       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
935   return result;
936 }
937 
938 ABSL_NAMESPACE_END
939 }  // namespace absl
940