• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "absl/strings/escaping.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24 
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35 
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39 
40 // Value for the leave_nulls_escaped argument to CUnescapeInternal().
// Passing false asks for escapes that evaluate to NUL (e.g. \0, \x00) to be
// decoded like any other escape instead of being copied through verbatim.
41 constexpr bool kUnescapeNulls = false;
42 
// Returns true iff 'c' is one of the ASCII characters '0' through '7'.
inline bool is_octal_digit(char c) {
  // After subtracting '0', every non-octal character lands outside [0, 8):
  // characters below '0' become negative and wrap to large unsigned values.
  return static_cast<unsigned int>(c - '0') < 8u;
}
44 
hex_digit_to_int(char c)45 inline unsigned int hex_digit_to_int(char c) {
46   static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47                 "Character set must be ASCII.");
48   assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
49   unsigned int x = static_cast<unsigned char>(c);
50   if (x > '9') {
51     x += 9;
52   }
53   return x & 0xf;
54 }
55 
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57   if (c >= 0xD800 && c <= 0xDFFF) {
58     if (error) {
59       *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60                             src);
61     }
62     return true;
63   }
64   return false;
65 }
66 
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 //    Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 //    Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 //    If 'source' is valid, stores the unescaped string and its size in
74 //    'dest' and 'dest_len' respectively, and returns true. Otherwise
75 //    returns false and optionally stores the error description in
76 //    'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 //    'dest' should point to a buffer that is at least as big as 'source'.
79 //    'source' and 'dest' may be the same.
80 //
81 //     NOTE: any changes to this function must also be reflected in the older
82 //     UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85                        char* dest, ptrdiff_t* dest_len, std::string* error) {
86   char* d = dest;
87   const char* p = source.data();
88   const char* end = p + source.size();
89   const char* last_byte = end - 1;
90 
91   // Small optimization for case where source = dest and there's no escaping
92   while (p == d && p < end && *p != '\\') p++, d++;
93 
94   while (p < end) {
95     if (*p != '\\') {
96       *d++ = *p++;
97     } else {
98       if (++p > last_byte) {  // skip past the '\\'
99         if (error) *error = "String cannot end with \\";
100         return false;
101       }
102       switch (*p) {
103         case 'a':  *d++ = '\a';  break;
104         case 'b':  *d++ = '\b';  break;
105         case 'f':  *d++ = '\f';  break;
106         case 'n':  *d++ = '\n';  break;
107         case 'r':  *d++ = '\r';  break;
108         case 't':  *d++ = '\t';  break;
109         case 'v':  *d++ = '\v';  break;
110         case '\\': *d++ = '\\';  break;
111         case '?':  *d++ = '\?';  break;    // \?  Who knew?
112         case '\'': *d++ = '\'';  break;
113         case '"':  *d++ = '\"';  break;
114         case '0':
115         case '1':
116         case '2':
117         case '3':
118         case '4':
119         case '5':
120         case '6':
121         case '7': {
122           // octal digit: 1 to 3 digits
123           const char* octal_start = p;
124           unsigned int ch = static_cast<unsigned int>(*p - '0');  // digit 1
125           if (p < last_byte && is_octal_digit(p[1]))
126             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 2
127           if (p < last_byte && is_octal_digit(p[1]))
128             ch = ch * 8 + static_cast<unsigned int>(*++p - '0');  // digit 3
129           if (ch > 0xff) {
130             if (error) {
131               *error = "Value of \\" +
132                        std::string(octal_start,
133                                    static_cast<size_t>(p + 1 - octal_start)) +
134                        " exceeds 0xff";
135             }
136             return false;
137           }
138           if ((ch == 0) && leave_nulls_escaped) {
139             // Copy the escape sequence for the null character
140             const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
141             *d++ = '\\';
142             memmove(d, octal_start, octal_size);
143             d += octal_size;
144             break;
145           }
146           *d++ = static_cast<char>(ch);
147           break;
148         }
149         case 'x':
150         case 'X': {
151           if (p >= last_byte) {
152             if (error) *error = "String cannot end with \\x";
153             return false;
154           } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
155             if (error) *error = "\\x cannot be followed by a non-hex digit";
156             return false;
157           }
158           unsigned int ch = 0;
159           const char* hex_start = p;
160           while (p < last_byte &&
161                  absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
162             // Arbitrarily many hex digits
163             ch = (ch << 4) + hex_digit_to_int(*++p);
164           if (ch > 0xFF) {
165             if (error) {
166               *error = "Value of \\" +
167                        std::string(hex_start,
168                                    static_cast<size_t>(p + 1 - hex_start)) +
169                        " exceeds 0xff";
170             }
171             return false;
172           }
173           if ((ch == 0) && leave_nulls_escaped) {
174             // Copy the escape sequence for the null character
175             const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
176             *d++ = '\\';
177             memmove(d, hex_start, hex_size);
178             d += hex_size;
179             break;
180           }
181           *d++ = static_cast<char>(ch);
182           break;
183         }
184         case 'u': {
185           // \uhhhh => convert 4 hex digits to UTF-8
186           char32_t rune = 0;
187           const char* hex_start = p;
188           if (p + 4 >= end) {
189             if (error) {
190               *error = "\\u must be followed by 4 hex digits: \\" +
191                        std::string(hex_start,
192                                    static_cast<size_t>(p + 1 - hex_start));
193             }
194             return false;
195           }
196           for (int i = 0; i < 4; ++i) {
197             // Look one char ahead.
198             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
199               rune = (rune << 4) + hex_digit_to_int(*++p);  // Advance p.
200             } else {
201               if (error) {
202                 *error = "\\u must be followed by 4 hex digits: \\" +
203                          std::string(hex_start,
204                                      static_cast<size_t>(p + 1 - hex_start));
205               }
206               return false;
207             }
208           }
209           if ((rune == 0) && leave_nulls_escaped) {
210             // Copy the escape sequence for the null character
211             *d++ = '\\';
212             memmove(d, hex_start, 5);  // u0000
213             d += 5;
214             break;
215           }
216           if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
217             return false;
218           }
219           d += strings_internal::EncodeUTF8Char(d, rune);
220           break;
221         }
222         case 'U': {
223           // \Uhhhhhhhh => convert 8 hex digits to UTF-8
224           char32_t rune = 0;
225           const char* hex_start = p;
226           if (p + 8 >= end) {
227             if (error) {
228               *error = "\\U must be followed by 8 hex digits: \\" +
229                        std::string(hex_start,
230                                    static_cast<size_t>(p + 1 - hex_start));
231             }
232             return false;
233           }
234           for (int i = 0; i < 8; ++i) {
235             // Look one char ahead.
236             if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
237               // Don't change rune until we're sure this
238               // is within the Unicode limit, but do advance p.
239               uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
240               if (newrune > 0x10FFFF) {
241                 if (error) {
242                   *error = "Value of \\" +
243                            std::string(hex_start,
244                                        static_cast<size_t>(p + 1 - hex_start)) +
245                            " exceeds Unicode limit (0x10FFFF)";
246                 }
247                 return false;
248               } else {
249                 rune = newrune;
250               }
251             } else {
252               if (error) {
253                 *error = "\\U must be followed by 8 hex digits: \\" +
254                          std::string(hex_start,
255                                      static_cast<size_t>(p + 1 - hex_start));
256               }
257               return false;
258             }
259           }
260           if ((rune == 0) && leave_nulls_escaped) {
261             // Copy the escape sequence for the null character
262             *d++ = '\\';
263             memmove(d, hex_start, 9);  // U00000000
264             d += 9;
265             break;
266           }
267           if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
268             return false;
269           }
270           d += strings_internal::EncodeUTF8Char(d, rune);
271           break;
272         }
273         default: {
274           if (error) *error = std::string("Unknown escape sequence: \\") + *p;
275           return false;
276         }
277       }
278       p++;                                 // read past letter we escaped
279     }
280   }
281   *dest_len = d - dest;
282   return true;
283 }
284 
285 // ----------------------------------------------------------------------
286 // CUnescapeInternal()
287 //
288 //    Same as above but uses a std::string for output. 'source' and 'dest'
289 //    may be the same.
290 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)291 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
292                        std::string* dest, std::string* error) {
293   strings_internal::STLStringResizeUninitialized(dest, source.size());
294 
295   ptrdiff_t dest_size;
296   if (!CUnescapeInternal(source,
297                          leave_nulls_escaped,
298                          &(*dest)[0],
299                          &dest_size,
300                          error)) {
301     return false;
302   }
303   dest->erase(static_cast<size_t>(dest_size));
304   return true;
305 }
306 
307 // ----------------------------------------------------------------------
308 // CEscape()
309 // CHexEscape()
310 // Utf8SafeCEscape()
311 // Utf8SafeCHexEscape()
312 //    Escapes 'src' using C-style escape sequences.  This is useful for
313 //    preparing query flags.  The 'Hex' version uses hexadecimal rather than
314 //    octal sequences.  The 'Utf8Safe' version does not touch UTF-8 bytes.
315 //
316 //    Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
317 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)318 std::string CEscapeInternal(absl::string_view src, bool use_hex,
319                             bool utf8_safe) {
320   std::string dest;
321   bool last_hex_escape = false;  // true if last output char was \xNN.
322 
323   for (char c : src) {
324     bool is_hex_escape = false;
325     switch (c) {
326       case '\n': dest.append("\\" "n"); break;
327       case '\r': dest.append("\\" "r"); break;
328       case '\t': dest.append("\\" "t"); break;
329       case '\"': dest.append("\\" "\""); break;
330       case '\'': dest.append("\\" "'"); break;
331       case '\\': dest.append("\\" "\\"); break;
332       default: {
333         // Note that if we emit \xNN and the src character after that is a hex
334         // digit then that digit must be escaped too to prevent it being
335         // interpreted as part of the character code by C.
336         const unsigned char uc = static_cast<unsigned char>(c);
337         if ((!utf8_safe || uc < 0x80) &&
338             (!absl::ascii_isprint(uc) ||
339              (last_hex_escape && absl::ascii_isxdigit(uc)))) {
340           if (use_hex) {
341             dest.append("\\" "x");
342             dest.push_back(numbers_internal::kHexChar[uc / 16]);
343             dest.push_back(numbers_internal::kHexChar[uc % 16]);
344             is_hex_escape = true;
345           } else {
346             dest.append("\\");
347             dest.push_back(numbers_internal::kHexChar[uc / 64]);
348             dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
349             dest.push_back(numbers_internal::kHexChar[uc % 8]);
350           }
351         } else {
352           dest.push_back(c);
353           break;
354         }
355       }
356     }
357     last_hex_escape = is_hex_escape;
358   }
359 
360   return dest;
361 }
362 
363 /* clang-format off */
// c_escaped_len[b] is the number of output bytes that the octal-style
// C escaping below produces for input byte b:
//   1 - the byte is copied unchanged,
//   2 - the byte becomes a two-character backslash escape (\t, \n, \r, \",
//       \', \\),
//   4 - the byte becomes a backslash followed by three octal digits.
364 constexpr unsigned char c_escaped_len[256] = {
365     4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
366     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
367     1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
368     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
369     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
370     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
371     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
372     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
373     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
374     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
375     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
376     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
377     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
378     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
379     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
380     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
381 };
382 /* clang-format on */
383 
384 // Calculates the length of the C-style escaped version of 'src'.
385 // Assumes that non-printable characters are escaped using octal sequences, and
386 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)387 inline size_t CEscapedLength(absl::string_view src) {
388   size_t escaped_len = 0;
389   for (char c : src)
390     escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
391   return escaped_len;
392 }
393 
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)394 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
395   size_t escaped_len = CEscapedLength(src);
396   if (escaped_len == src.size()) {
397     dest->append(src.data(), src.size());
398     return;
399   }
400 
401   size_t cur_dest_len = dest->size();
402   strings_internal::STLStringResizeUninitialized(dest,
403                                                  cur_dest_len + escaped_len);
404   char* append_ptr = &(*dest)[cur_dest_len];
405 
406   for (char c : src) {
407     size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
408     if (char_len == 1) {
409       *append_ptr++ = c;
410     } else if (char_len == 2) {
411       switch (c) {
412         case '\n':
413           *append_ptr++ = '\\';
414           *append_ptr++ = 'n';
415           break;
416         case '\r':
417           *append_ptr++ = '\\';
418           *append_ptr++ = 'r';
419           break;
420         case '\t':
421           *append_ptr++ = '\\';
422           *append_ptr++ = 't';
423           break;
424         case '\"':
425           *append_ptr++ = '\\';
426           *append_ptr++ = '\"';
427           break;
428         case '\'':
429           *append_ptr++ = '\\';
430           *append_ptr++ = '\'';
431           break;
432         case '\\':
433           *append_ptr++ = '\\';
434           *append_ptr++ = '\\';
435           break;
436       }
437     } else {
438       *append_ptr++ = '\\';
439       *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
440       *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
441       *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
442     }
443   }
444 }
445 
446 // Reverses the mapping in Base64EscapeInternal; see that method's
447 // documentation for details of the mapping.
//
// Parameters:
//   src_param, szsrc - the encoded input and its length in bytes.
//   dest, szdest     - output buffer and its capacity. If 'dest' is null the
//                      input is only validated; the decoded length is still
//                      counted.
//   unbase64         - table indexed by input byte value; a negative entry
//                      marks a byte that is not a data character.
//   len              - receives the number of decoded bytes, written only
//                      when the function returns true.
//
// ASCII whitespace in the input is skipped. Both '=' and '.' are accepted as
// padding characters, and the input may carry either no padding at all or
// exactly the number of pad characters implied by the leftover data.
//
// Returns false if an illegal character is found, if a single 6-bit group
// is left over, if the padding count is wrong, or if the decoded data does
// not fit in 'szdest' bytes.
Base64UnescapeInternal(const char * src_param,size_t szsrc,char * dest,size_t szdest,const signed char * unbase64,size_t * len)448 bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
449                             size_t szdest, const signed char* unbase64,
450                             size_t* len) {
451   static const char kPad64Equals = '=';
452   static const char kPad64Dot = '.';
453 
// destidx: bytes produced so far; decode: table value of the last byte read;
// state: data characters accumulated in 'temp' for the current group;
// ch: the last byte read; temp: accumulated 6-bit groups.
454   size_t destidx = 0;
455   int decode = 0;
456   int state = 0;
457   unsigned char ch = 0;
458   unsigned int temp = 0;
459 
460   // If "char" is signed by default, using *src as an array index results in
461   // accessing negative array elements. Treat the input as a pointer to
462   // unsigned char to avoid this.
463   const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);
464 
465   // The GET_INPUT macro gets the next input character, skipping
466   // over any whitespace, and stopping when we reach the end of the
467   // string or when we read any non-data character.  The arguments are
468   // an arbitrary identifier (used as a label for goto) and the number
469   // of data bytes that must remain in the input to avoid aborting the
470   // loop.
471 #define GET_INPUT(label, remain)                                \
472   label:                                                        \
473   --szsrc;                                                      \
474   ch = *src++;                                                  \
475   decode = unbase64[ch];                                        \
476   if (decode < 0) {                                             \
477     if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
478     state = 4 - remain;                                         \
479     break;                                                      \
480   }
481 
482   // if dest is null, we're just checking to see if it's legal input
483   // rather than producing output.  (I suspect this could just be done
484   // with a regexp...).  We duplicate the loop so this test can be
485   // outside it instead of in every iteration.
486 
487   if (dest) {
488     // This loop consumes 4 input bytes and produces 3 output bytes
489     // per iteration.  We can't know at the start that there is enough
490     // data left in the string for a full iteration, so the loop may
491     // break out in the middle; if so 'state' will be set to the
492     // number of input bytes read.
493 
494     while (szsrc >= 4) {
495       // We'll start by optimistically assuming that the next four
496       // bytes of the string (src[0..3]) are four good data bytes
497       // (that is, no nulls, whitespace, padding chars, or illegal
498       // chars).  We need to test src[0..2] for nulls individually
499       // before constructing temp to preserve the property that we
500       // never read past a null in the string (no matter how long
501       // szsrc claims the string is).
502 
503       if (!src[0] || !src[1] || !src[2] ||
504           ((temp = ((unsigned(unbase64[src[0]]) << 18) |
505                     (unsigned(unbase64[src[1]]) << 12) |
506                     (unsigned(unbase64[src[2]]) << 6) |
507                     (unsigned(unbase64[src[3]])))) &
508            0x80000000)) {
509         // Iff any of those four characters was bad (null, illegal,
510         // whitespace, padding), then temp's high bit will be set
511         // (because unbase64[] is -1 for all bad characters).
512         //
513         // We'll back up and resort to the slower decoder, which knows
514         // how to handle those cases.
515 
516         GET_INPUT(first, 4);
517         temp = static_cast<unsigned char>(decode);
518         GET_INPUT(second, 3);
519         temp = (temp << 6) | static_cast<unsigned char>(decode);
520         GET_INPUT(third, 2);
521         temp = (temp << 6) | static_cast<unsigned char>(decode);
522         GET_INPUT(fourth, 1);
523         temp = (temp << 6) | static_cast<unsigned char>(decode);
524       } else {
525         // We really did have four good data bytes, so advance four
526         // characters in the string.
527 
528         szsrc -= 4;
529         src += 4;
530       }
531 
532       // temp has 24 bits of input, so write that out as three bytes.
533 
534       if (destidx + 3 > szdest) return false;
535       dest[destidx + 2] = static_cast<char>(temp);
536       temp >>= 8;
537       dest[destidx + 1] = static_cast<char>(temp);
538       temp >>= 8;
539       dest[destidx] = static_cast<char>(temp);
540       destidx += 3;
541     }
542   } else {
// Validation-only copy of the loop above: same input handling, but no
// bytes are stored and only the output length is counted.
543     while (szsrc >= 4) {
544       if (!src[0] || !src[1] || !src[2] ||
545           ((temp = ((unsigned(unbase64[src[0]]) << 18) |
546                     (unsigned(unbase64[src[1]]) << 12) |
547                     (unsigned(unbase64[src[2]]) << 6) |
548                     (unsigned(unbase64[src[3]])))) &
549            0x80000000)) {
550         GET_INPUT(first_no_dest, 4);
551         GET_INPUT(second_no_dest, 3);
552         GET_INPUT(third_no_dest, 2);
553         GET_INPUT(fourth_no_dest, 1);
554       } else {
555         szsrc -= 4;
556         src += 4;
557       }
558       destidx += 3;
559     }
560   }
561 
562 #undef GET_INPUT
563 
564   // if the loop terminated because we read a bad character, return
565   // now.
566   if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
567       !absl::ascii_isspace(ch))
568     return false;
569 
570   if (ch == kPad64Equals || ch == kPad64Dot) {
571     // if we stopped by hitting an '=' or '.', un-read that character -- we'll
572     // look at it again when we count to check for the proper number of
573     // equals signs at the end.
574     ++szsrc;
575     --src;
576   } else {
577     // This loop consumes 1 input byte per iteration.  It's used to
578     // clean up the 0-3 input bytes remaining when the first, faster
579     // loop finishes.  'temp' contains the data from 'state' input
580     // characters read by the first loop.
581     while (szsrc > 0) {
582       --szsrc;
583       ch = *src++;
584       decode = unbase64[ch];
585       if (decode < 0) {
586         if (absl::ascii_isspace(ch)) {
587           continue;
588         } else if (ch == kPad64Equals || ch == kPad64Dot) {
589           // back up one character; we'll read it again when we check
590           // for the correct number of pad characters at the end.
591           ++szsrc;
592           --src;
593           break;
594         } else {
595           return false;
596         }
597       }
598 
599       // Each input character gives us six bits of output.
600       temp = (temp << 6) | static_cast<unsigned char>(decode);
601       ++state;
602       if (state == 4) {
603         // If we've accumulated 24 bits of output, write that out as
604         // three bytes.
605         if (dest) {
606           if (destidx + 3 > szdest) return false;
607           dest[destidx + 2] = static_cast<char>(temp);
608           temp >>= 8;
609           dest[destidx + 1] = static_cast<char>(temp);
610           temp >>= 8;
611           dest[destidx] = static_cast<char>(temp);
612         }
613         destidx += 3;
614         state = 0;
615         temp = 0;
616       }
617     }
618   }
619 
620   // Process the leftover data contained in 'temp' at the end of the input.
621   int expected_equals = 0;
622   switch (state) {
623     case 0:
624       // Nothing left over; output is a multiple of 3 bytes.
625       break;
626 
627     case 1:
628       // Bad input; we have 6 bits left over.
629       return false;
630 
631     case 2:
632       // Produce one more output byte from the 12 input bits we have left.
633       if (dest) {
634         if (destidx + 1 > szdest) return false;
635         temp >>= 4;
636         dest[destidx] = static_cast<char>(temp);
637       }
638       ++destidx;
639       expected_equals = 2;
640       break;
641 
642     case 3:
643       // Produce two more output bytes from the 18 input bits we have left.
644       if (dest) {
645         if (destidx + 2 > szdest) return false;
646         temp >>= 2;
647         dest[destidx + 1] = static_cast<char>(temp);
648         temp >>= 8;
649         dest[destidx] = static_cast<char>(temp);
650       }
651       destidx += 2;
652       expected_equals = 1;
653       break;
654 
655     default:
656       // state should have no other values at this point.
657       ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
658                    state);
659   }
660 
661   // The remainder of the string should be all whitespace, mixed with
662   // exactly 0 equals signs, or exactly 'expected_equals' equals
663   // signs.  (Always accepting 0 equals signs is an Abseil extension
664   // not covered in the RFC, as is accepting dot as the pad character.)
665 
666   int equals = 0;
667   while (szsrc > 0) {
668     if (*src == kPad64Equals || *src == kPad64Dot)
669       ++equals;
670     else if (!absl::ascii_isspace(*src))
671       return false;
672     --szsrc;
673     ++src;
674   }
675 
// '*len' is only written on success.
676   const bool ok = (equals == 0 || equals == expected_equals);
677   if (ok) *len = destidx;
678   return ok;
679 }
680 
681 // The arrays below map base64-escaped characters back to their original values.
682 // For the inverse case, see k(WebSafe)Base64Chars in the internal
683 // escaping.cc.
684 // These arrays were generated by the following inversion code:
685 // #include <sys/time.h>
686 // #include <stdlib.h>
687 // #include <string.h>
688 // main()
689 // {
690 //   static const char Base64[] =
691 //     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
692 //   char* pos;
693 //   int idx, i, j;
694 //   printf("    ");
695 //   for (i = 0; i < 255; i += 8) {
696 //     for (j = i; j < i + 8; j++) {
697 //       pos = strchr(Base64, j);
698 //       if ((pos == nullptr) || (j == 0))
699 //         idx = -1;
700 //       else
701 //         idx = pos - Base64;
702 //       if (idx == -1)
703 //         printf(" %2d,     ", idx);
704 //       else
705 //         printf(" %2d/*%c*/,", idx, j);
706 //     }
707 //     printf("\n    ");
708 //   }
709 // }
710 //
711 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
712 // in the internal escaping.cc.
713 /* clang-format off */
// Decode table for the standard base64 alphabet ('+' maps to 62, '/' to 63).
// Indexed by input byte value; each entry is the 6-bit value of that
// character, or -1 if the byte is not part of the alphabet.
714 constexpr signed char kUnBase64[] = {
715     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
716     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
717     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
718     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
719     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
720     -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
721     52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
722     60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
723     -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
724     07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
725     15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
726     23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
727     -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
728     33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
729     41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
730     49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
731     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
732     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
733     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
734     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
735     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
736     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
737     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
738     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
739     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
740     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
741     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
742     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
743     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
744     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
745     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
746     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
747 };
748 
// Decode table for the web-safe base64 alphabet ('-' maps to 62, '_' to 63).
// Indexed by input byte value; each entry is the 6-bit value of that
// character, or -1 if the byte is not part of the alphabet.
749 constexpr signed char kUnWebSafeBase64[] = {
750     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
751     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
752     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
753     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
754     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
755     -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
756     52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
757     60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
758     -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
759     07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
760     15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
761     23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
762     -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
763     33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
764     41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
765     49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
766     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
767     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
768     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
769     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
770     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
771     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
772     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
773     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
774     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
775     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
776     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
777     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
778     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
779     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
780     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
781     -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
782 };
783 /* clang-format on */
784 
785 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)786 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
787                             const signed char* unbase64) {
788   // Determine the size of the output string.  Base64 encodes every 3 bytes into
789   // 4 characters.  Any leftover chars are added directly for good measure.
790   const size_t dest_len = 3 * (slen / 4) + (slen % 4);
791 
792   strings_internal::STLStringResizeUninitialized(dest, dest_len);
793 
794   // We are getting the destination buffer by getting the beginning of the
795   // string and converting it into a char *.
796   size_t len;
797   const bool ok =
798       Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
799   if (!ok) {
800     dest->clear();
801     return false;
802   }
803 
804   // could be shorter if there was padding
805   assert(len <= dest_len);
806   dest->erase(len);
807 
808   return true;
809 }
810 
811 /* clang-format off */
// Maps a byte value (used as the index) to the numeric value of the hex digit
// it denotes.  "Lenient": bytes that are not hex digits map to 0 instead of an
// error marker.  Each row below covers 16 consecutive byte values, starting at
// the offset shown in the leading comment.
constexpr char kHexValueLenient[256] = {
    /* 0x00 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x10 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x20 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0,  // 0-9
    /* 0x40 */ 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // A-F
    /* 0x50 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // a-f
    /* 0x70 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x80 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xD0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xE0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xF0 */ 0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
};
830 
831 /* clang-format on */
832 
833 // This is a templated function so that T can be either a char*
834 // or a string.  This works because we use the [] operator to access
835 // individual characters at a time.
836 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)837 void HexStringToBytesInternal(const char* from, T to, size_t num) {
838   for (size_t i = 0; i < num; i++) {
839     to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
840             (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
841   }
842 }
843 
844 // This is a templated function so that T can be either a char* or a
845 // std::string.
846 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)847 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
848   auto dest_ptr = &dest[0];
849   for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
850     const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
851     std::copy(hex_p, hex_p + 2, dest_ptr);
852   }
853 }
854 
855 }  // namespace
856 
857 // ----------------------------------------------------------------------
858 // CUnescape()
859 //
860 // See CUnescapeInternal() for implementation details.
861 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)862 bool CUnescape(absl::string_view source, std::string* dest,
863                std::string* error) {
864   return CUnescapeInternal(source, kUnescapeNulls, dest, error);
865 }
866 
CEscape(absl::string_view src)867 std::string CEscape(absl::string_view src) {
868   std::string dest;
869   CEscapeAndAppendInternal(src, &dest);
870   return dest;
871 }
872 
CHexEscape(absl::string_view src)873 std::string CHexEscape(absl::string_view src) {
874   return CEscapeInternal(src, true, false);
875 }
876 
Utf8SafeCEscape(absl::string_view src)877 std::string Utf8SafeCEscape(absl::string_view src) {
878   return CEscapeInternal(src, false, true);
879 }
880 
Utf8SafeCHexEscape(absl::string_view src)881 std::string Utf8SafeCHexEscape(absl::string_view src) {
882   return CEscapeInternal(src, true, true);
883 }
884 
Base64Unescape(absl::string_view src,std::string * dest)885 bool Base64Unescape(absl::string_view src, std::string* dest) {
886   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
887 }
888 
WebSafeBase64Unescape(absl::string_view src,std::string * dest)889 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
890   return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
891 }
892 
Base64Escape(absl::string_view src,std::string * dest)893 void Base64Escape(absl::string_view src, std::string* dest) {
894   strings_internal::Base64EscapeInternal(
895       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
896       true, strings_internal::kBase64Chars);
897 }
898 
WebSafeBase64Escape(absl::string_view src,std::string * dest)899 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
900   strings_internal::Base64EscapeInternal(
901       reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
902       false, strings_internal::kWebSafeBase64Chars);
903 }
904 
Base64Escape(absl::string_view src)905 std::string Base64Escape(absl::string_view src) {
906   std::string dest;
907   strings_internal::Base64EscapeInternal(
908       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
909       true, strings_internal::kBase64Chars);
910   return dest;
911 }
912 
WebSafeBase64Escape(absl::string_view src)913 std::string WebSafeBase64Escape(absl::string_view src) {
914   std::string dest;
915   strings_internal::Base64EscapeInternal(
916       reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
917       false, strings_internal::kWebSafeBase64Chars);
918   return dest;
919 }
920 
HexStringToBytes(absl::string_view from)921 std::string HexStringToBytes(absl::string_view from) {
922   std::string result;
923   const auto num = from.size() / 2;
924   strings_internal::STLStringResizeUninitialized(&result, num);
925   absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
926   return result;
927 }
928 
BytesToHexString(absl::string_view from)929 std::string BytesToHexString(absl::string_view from) {
930   std::string result;
931   strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
932   absl::BytesToHexStringInternal<std::string&>(
933       reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
934   return result;
935 }
936 
937 ABSL_NAMESPACE_END
938 }  // namespace absl
939