1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/escaping.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstdint>
20 #include <cstring>
21 #include <iterator>
22 #include <limits>
23 #include <string>
24
25 #include "absl/base/internal/endian.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/internal/char_map.h"
29 #include "absl/strings/internal/escaping.h"
30 #include "absl/strings/internal/resize_uninitialized.h"
31 #include "absl/strings/internal/utf8.h"
32 #include "absl/strings/str_cat.h"
33 #include "absl/strings/str_join.h"
34 #include "absl/strings/string_view.h"
35
36 namespace absl {
37 ABSL_NAMESPACE_BEGIN
38 namespace {
39
// Value passed as the leave_nulls_escaped argument of CUnescapeInternal().
// false means that an escape sequence whose value is 0 is decoded into a real
// '\0' byte instead of being copied to the output still escaped.
constexpr bool kUnescapeNulls = false;
42
// Returns true iff `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  if (c < '0') return false;
  return c <= '7';
}
44
hex_digit_to_int(char c)45 inline unsigned int hex_digit_to_int(char c) {
46 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
47 "Character set must be ASCII.");
48 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
49 unsigned int x = static_cast<unsigned char>(c);
50 if (x > '9') {
51 x += 9;
52 }
53 return x & 0xf;
54 }
55
IsSurrogate(char32_t c,absl::string_view src,std::string * error)56 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
57 if (c >= 0xD800 && c <= 0xDFFF) {
58 if (error) {
59 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
60 src);
61 }
62 return true;
63 }
64 return false;
65 }
66
67 // ----------------------------------------------------------------------
68 // CUnescapeInternal()
69 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
70 //
71 // Unescapes C escape sequences and is the reverse of CEscape().
72 //
73 // If 'source' is valid, stores the unescaped string and its size in
74 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
75 // returns false and optionally stores the error description in
76 // 'error'. Set 'error' to nullptr to disable error reporting.
77 //
78 // 'dest' should point to a buffer that is at least as big as 'source'.
79 // 'source' and 'dest' may be the same.
80 //
81 // NOTE: any changes to this function must also be reflected in the older
82 // UnescapeCEscapeSequences().
83 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)84 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
85 char* dest, ptrdiff_t* dest_len, std::string* error) {
86 char* d = dest;
87 const char* p = source.data();
88 const char* end = p + source.size();
89 const char* last_byte = end - 1;
90
91 // Small optimization for case where source = dest and there's no escaping
92 while (p == d && p < end && *p != '\\') p++, d++;
93
94 while (p < end) {
95 if (*p != '\\') {
96 *d++ = *p++;
97 } else {
98 if (++p > last_byte) { // skip past the '\\'
99 if (error) *error = "String cannot end with \\";
100 return false;
101 }
102 switch (*p) {
103 case 'a': *d++ = '\a'; break;
104 case 'b': *d++ = '\b'; break;
105 case 'f': *d++ = '\f'; break;
106 case 'n': *d++ = '\n'; break;
107 case 'r': *d++ = '\r'; break;
108 case 't': *d++ = '\t'; break;
109 case 'v': *d++ = '\v'; break;
110 case '\\': *d++ = '\\'; break;
111 case '?': *d++ = '\?'; break; // \? Who knew?
112 case '\'': *d++ = '\''; break;
113 case '"': *d++ = '\"'; break;
114 case '0':
115 case '1':
116 case '2':
117 case '3':
118 case '4':
119 case '5':
120 case '6':
121 case '7': {
122 // octal digit: 1 to 3 digits
123 const char* octal_start = p;
124 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
125 if (p < last_byte && is_octal_digit(p[1]))
126 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
127 if (p < last_byte && is_octal_digit(p[1]))
128 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
129 if (ch > 0xff) {
130 if (error) {
131 *error = "Value of \\" +
132 std::string(octal_start,
133 static_cast<size_t>(p + 1 - octal_start)) +
134 " exceeds 0xff";
135 }
136 return false;
137 }
138 if ((ch == 0) && leave_nulls_escaped) {
139 // Copy the escape sequence for the null character
140 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
141 *d++ = '\\';
142 memmove(d, octal_start, octal_size);
143 d += octal_size;
144 break;
145 }
146 *d++ = static_cast<char>(ch);
147 break;
148 }
149 case 'x':
150 case 'X': {
151 if (p >= last_byte) {
152 if (error) *error = "String cannot end with \\x";
153 return false;
154 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
155 if (error) *error = "\\x cannot be followed by a non-hex digit";
156 return false;
157 }
158 unsigned int ch = 0;
159 const char* hex_start = p;
160 while (p < last_byte &&
161 absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
162 // Arbitrarily many hex digits
163 ch = (ch << 4) + hex_digit_to_int(*++p);
164 if (ch > 0xFF) {
165 if (error) {
166 *error = "Value of \\" +
167 std::string(hex_start,
168 static_cast<size_t>(p + 1 - hex_start)) +
169 " exceeds 0xff";
170 }
171 return false;
172 }
173 if ((ch == 0) && leave_nulls_escaped) {
174 // Copy the escape sequence for the null character
175 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
176 *d++ = '\\';
177 memmove(d, hex_start, hex_size);
178 d += hex_size;
179 break;
180 }
181 *d++ = static_cast<char>(ch);
182 break;
183 }
184 case 'u': {
185 // \uhhhh => convert 4 hex digits to UTF-8
186 char32_t rune = 0;
187 const char* hex_start = p;
188 if (p + 4 >= end) {
189 if (error) {
190 *error = "\\u must be followed by 4 hex digits: \\" +
191 std::string(hex_start,
192 static_cast<size_t>(p + 1 - hex_start));
193 }
194 return false;
195 }
196 for (int i = 0; i < 4; ++i) {
197 // Look one char ahead.
198 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
199 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
200 } else {
201 if (error) {
202 *error = "\\u must be followed by 4 hex digits: \\" +
203 std::string(hex_start,
204 static_cast<size_t>(p + 1 - hex_start));
205 }
206 return false;
207 }
208 }
209 if ((rune == 0) && leave_nulls_escaped) {
210 // Copy the escape sequence for the null character
211 *d++ = '\\';
212 memmove(d, hex_start, 5); // u0000
213 d += 5;
214 break;
215 }
216 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
217 return false;
218 }
219 d += strings_internal::EncodeUTF8Char(d, rune);
220 break;
221 }
222 case 'U': {
223 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
224 char32_t rune = 0;
225 const char* hex_start = p;
226 if (p + 8 >= end) {
227 if (error) {
228 *error = "\\U must be followed by 8 hex digits: \\" +
229 std::string(hex_start,
230 static_cast<size_t>(p + 1 - hex_start));
231 }
232 return false;
233 }
234 for (int i = 0; i < 8; ++i) {
235 // Look one char ahead.
236 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
237 // Don't change rune until we're sure this
238 // is within the Unicode limit, but do advance p.
239 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
240 if (newrune > 0x10FFFF) {
241 if (error) {
242 *error = "Value of \\" +
243 std::string(hex_start,
244 static_cast<size_t>(p + 1 - hex_start)) +
245 " exceeds Unicode limit (0x10FFFF)";
246 }
247 return false;
248 } else {
249 rune = newrune;
250 }
251 } else {
252 if (error) {
253 *error = "\\U must be followed by 8 hex digits: \\" +
254 std::string(hex_start,
255 static_cast<size_t>(p + 1 - hex_start));
256 }
257 return false;
258 }
259 }
260 if ((rune == 0) && leave_nulls_escaped) {
261 // Copy the escape sequence for the null character
262 *d++ = '\\';
263 memmove(d, hex_start, 9); // U00000000
264 d += 9;
265 break;
266 }
267 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
268 return false;
269 }
270 d += strings_internal::EncodeUTF8Char(d, rune);
271 break;
272 }
273 default: {
274 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
275 return false;
276 }
277 }
278 p++; // read past letter we escaped
279 }
280 }
281 *dest_len = d - dest;
282 return true;
283 }
284
285 // ----------------------------------------------------------------------
286 // CUnescapeInternal()
287 //
288 // Same as above but uses a std::string for output. 'source' and 'dest'
289 // may be the same.
290 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)291 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
292 std::string* dest, std::string* error) {
293 strings_internal::STLStringResizeUninitialized(dest, source.size());
294
295 ptrdiff_t dest_size;
296 if (!CUnescapeInternal(source,
297 leave_nulls_escaped,
298 &(*dest)[0],
299 &dest_size,
300 error)) {
301 return false;
302 }
303 dest->erase(static_cast<size_t>(dest_size));
304 return true;
305 }
306
307 // ----------------------------------------------------------------------
308 // CEscape()
309 // CHexEscape()
310 // Utf8SafeCEscape()
311 // Utf8SafeCHexEscape()
312 // Escapes 'src' using C-style escape sequences. This is useful for
313 // preparing query flags. The 'Hex' version uses hexadecimal rather than
314 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
315 //
316 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
317 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)318 std::string CEscapeInternal(absl::string_view src, bool use_hex,
319 bool utf8_safe) {
320 std::string dest;
321 bool last_hex_escape = false; // true if last output char was \xNN.
322
323 for (char c : src) {
324 bool is_hex_escape = false;
325 switch (c) {
326 case '\n': dest.append("\\" "n"); break;
327 case '\r': dest.append("\\" "r"); break;
328 case '\t': dest.append("\\" "t"); break;
329 case '\"': dest.append("\\" "\""); break;
330 case '\'': dest.append("\\" "'"); break;
331 case '\\': dest.append("\\" "\\"); break;
332 default: {
333 // Note that if we emit \xNN and the src character after that is a hex
334 // digit then that digit must be escaped too to prevent it being
335 // interpreted as part of the character code by C.
336 const unsigned char uc = static_cast<unsigned char>(c);
337 if ((!utf8_safe || uc < 0x80) &&
338 (!absl::ascii_isprint(uc) ||
339 (last_hex_escape && absl::ascii_isxdigit(uc)))) {
340 if (use_hex) {
341 dest.append("\\" "x");
342 dest.push_back(numbers_internal::kHexChar[uc / 16]);
343 dest.push_back(numbers_internal::kHexChar[uc % 16]);
344 is_hex_escape = true;
345 } else {
346 dest.append("\\");
347 dest.push_back(numbers_internal::kHexChar[uc / 64]);
348 dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
349 dest.push_back(numbers_internal::kHexChar[uc % 8]);
350 }
351 } else {
352 dest.push_back(c);
353 break;
354 }
355 }
356 }
357 last_hex_escape = is_hex_escape;
358 }
359
360 return dest;
361 }
362
363 /* clang-format off */
// Number of output characters needed to C-escape each possible byte value,
// indexed by the byte as an unsigned char:
//   1 - the byte is copied unchanged,
//   2 - the byte becomes a backslash plus one character
//       (\t, \n, \r, \", \', \\),
//   4 - the byte becomes a backslash plus three octal digits.
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
382 /* clang-format on */
383
384 // Calculates the length of the C-style escaped version of 'src'.
385 // Assumes that non-printable characters are escaped using octal sequences, and
386 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)387 inline size_t CEscapedLength(absl::string_view src) {
388 size_t escaped_len = 0;
389 for (char c : src)
390 escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
391 return escaped_len;
392 }
393
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)394 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
395 size_t escaped_len = CEscapedLength(src);
396 if (escaped_len == src.size()) {
397 dest->append(src.data(), src.size());
398 return;
399 }
400
401 size_t cur_dest_len = dest->size();
402 strings_internal::STLStringResizeUninitialized(dest,
403 cur_dest_len + escaped_len);
404 char* append_ptr = &(*dest)[cur_dest_len];
405
406 for (char c : src) {
407 size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
408 if (char_len == 1) {
409 *append_ptr++ = c;
410 } else if (char_len == 2) {
411 switch (c) {
412 case '\n':
413 *append_ptr++ = '\\';
414 *append_ptr++ = 'n';
415 break;
416 case '\r':
417 *append_ptr++ = '\\';
418 *append_ptr++ = 'r';
419 break;
420 case '\t':
421 *append_ptr++ = '\\';
422 *append_ptr++ = 't';
423 break;
424 case '\"':
425 *append_ptr++ = '\\';
426 *append_ptr++ = '\"';
427 break;
428 case '\'':
429 *append_ptr++ = '\\';
430 *append_ptr++ = '\'';
431 break;
432 case '\\':
433 *append_ptr++ = '\\';
434 *append_ptr++ = '\\';
435 break;
436 }
437 } else {
438 *append_ptr++ = '\\';
439 *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
440 *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
441 *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
442 }
443 }
444 }
445
// Reverses the mapping in Base64EscapeInternal; see that method's
// documentation for details of the mapping.
//
// Decodes the 'szsrc' bytes at 'src_param' as base64.
//
//   dest, szdest  output buffer and its capacity. If 'dest' is null the input
//                 is only validated and measured; nothing is written.
//   unbase64      lookup table indexed by input byte (as unsigned char) that
//                 gives the 6-bit value of the byte, or a negative value if
//                 the byte is not part of the alphabet.
//   len           receives the decoded length. Written only on success.
//
// ASCII whitespace in the input is skipped. Both '=' and '.' are accepted as
// pad characters.
//
// Returns false if the input contains an illegal character, if the decoded
// data does not fit in 'szdest', if a single leftover input character remains
// (only 6 bits), or if the number of pad characters is neither 0 nor the
// number expected for the leftover data.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced so far
  int decode = 0;         // table value of the most recently read byte
  int state = 0;          // number of data characters buffered in 'temp'
  unsigned char ch = 0;   // most recently read input byte
  unsigned int temp = 0;  // accumulator for up to 24 bits of decoded data

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
  //
  // Note that the macro's 'break' leaves the enclosing while loop, after
  // recording in 'state' how many data characters of the current group had
  // already been read.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only copy of the loop above: the input is consumed in the
    // same way and the output length is counted, but nothing is stored.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  // '*len' is only written when the padding count is acceptable.
  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
680
681 // The arrays below map base64-escaped characters back to their original values.
682 // For the inverse case, see k(WebSafe)Base64Chars in the internal
683 // escaping.cc.
684 // These arrays were generated by the following inversion code:
685 // #include <sys/time.h>
686 // #include <stdlib.h>
687 // #include <string.h>
688 // main()
689 // {
690 // static const char Base64[] =
691 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
692 // char* pos;
693 // int idx, i, j;
694 // printf(" ");
695 // for (i = 0; i < 255; i += 8) {
696 // for (j = i; j < i + 8; j++) {
697 // pos = strchr(Base64, j);
698 // if ((pos == nullptr) || (j == 0))
699 // idx = -1;
700 // else
701 // idx = pos - Base64;
702 // if (idx == -1)
703 // printf(" %2d, ", idx);
704 // else
705 // printf(" %2d/*%c*/,", idx, j);
706 // }
707 // printf("\n ");
708 // }
709 // }
710 //
711 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
712 // in the internal escaping.cc.
713 /* clang-format off */
// Decoding table for the standard base64 alphabet (A-Z, a-z, 0-9, '+', '/').
// Indexed by input byte (as unsigned char); each entry is the 6-bit value of
// that character, or -1 if the byte is not part of the alphabet.
constexpr signed char kUnBase64[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, 62/*+*/, -1, -1, -1, 63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
    -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
    07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, -1,
    -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1
};
748
// Decoding table for the web-safe base64 alphabet, which uses '-' and '_' in
// place of '+' and '/'. Indexed by input byte (as unsigned char); each entry
// is the 6-bit value of that character, or -1 if the byte is not part of the
// alphabet.
constexpr signed char kUnWebSafeBase64[] = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, 62/*-*/, -1, -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1, -1, -1, -1, -1, -1,
    -1, 0/*A*/, 1/*B*/, 2/*C*/, 3/*D*/, 4/*E*/, 5/*F*/, 6/*G*/,
    07/*H*/, 8/*I*/, 9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1, -1, -1, -1, 63/*_*/,
    -1, 26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1
};
783 /* clang-format on */
784
785 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)786 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
787 const signed char* unbase64) {
788 // Determine the size of the output string. Base64 encodes every 3 bytes into
789 // 4 characters. Any leftover chars are added directly for good measure.
790 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
791
792 strings_internal::STLStringResizeUninitialized(dest, dest_len);
793
794 // We are getting the destination buffer by getting the beginning of the
795 // string and converting it into a char *.
796 size_t len;
797 const bool ok =
798 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
799 if (!ok) {
800 dest->clear();
801 return false;
802 }
803
804 // could be shorter if there was padding
805 assert(len <= dest_len);
806 dest->erase(len);
807
808 return true;
809 }
810
811 /* clang-format off */
// Maps a byte (as unsigned char) to the numeric value of the hexadecimal
// digit it represents: '0'..'9' give 0..9 and 'A'..'F' / 'a'..'f' give
// 10..15. "Lenient" because every other byte maps to 0 rather than to an
// error marker.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  0,  0,  0,  0,  0,  0,  // '0'..'9'
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0, 10, 11, 12, 13, 14, 15,  0,  0,  0,  0,  0,  0,  0,  0,  0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
};
830
831 /* clang-format on */
832
833 // This is a templated function so that T can be either a char*
834 // or a string. This works because we use the [] operator to access
835 // individual characters at a time.
836 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)837 void HexStringToBytesInternal(const char* from, T to, size_t num) {
838 for (size_t i = 0; i < num; i++) {
839 to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
840 (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
841 }
842 }
843
844 // This is a templated function so that T can be either a char* or a
845 // std::string.
846 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)847 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
848 auto dest_ptr = &dest[0];
849 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
850 const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
851 std::copy(hex_p, hex_p + 2, dest_ptr);
852 }
853 }
854
855 } // namespace
856
857 // ----------------------------------------------------------------------
858 // CUnescape()
859 //
860 // See CUnescapeInternal() for implementation details.
861 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)862 bool CUnescape(absl::string_view source, std::string* dest,
863 std::string* error) {
864 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
865 }
866
CEscape(absl::string_view src)867 std::string CEscape(absl::string_view src) {
868 std::string dest;
869 CEscapeAndAppendInternal(src, &dest);
870 return dest;
871 }
872
CHexEscape(absl::string_view src)873 std::string CHexEscape(absl::string_view src) {
874 return CEscapeInternal(src, true, false);
875 }
876
Utf8SafeCEscape(absl::string_view src)877 std::string Utf8SafeCEscape(absl::string_view src) {
878 return CEscapeInternal(src, false, true);
879 }
880
Utf8SafeCHexEscape(absl::string_view src)881 std::string Utf8SafeCHexEscape(absl::string_view src) {
882 return CEscapeInternal(src, true, true);
883 }
884
Base64Unescape(absl::string_view src,std::string * dest)885 bool Base64Unescape(absl::string_view src, std::string* dest) {
886 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
887 }
888
WebSafeBase64Unescape(absl::string_view src,std::string * dest)889 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
890 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
891 }
892
Base64Escape(absl::string_view src,std::string * dest)893 void Base64Escape(absl::string_view src, std::string* dest) {
894 strings_internal::Base64EscapeInternal(
895 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
896 true, strings_internal::kBase64Chars);
897 }
898
WebSafeBase64Escape(absl::string_view src,std::string * dest)899 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
900 strings_internal::Base64EscapeInternal(
901 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
902 false, strings_internal::kWebSafeBase64Chars);
903 }
904
Base64Escape(absl::string_view src)905 std::string Base64Escape(absl::string_view src) {
906 std::string dest;
907 strings_internal::Base64EscapeInternal(
908 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
909 true, strings_internal::kBase64Chars);
910 return dest;
911 }
912
WebSafeBase64Escape(absl::string_view src)913 std::string WebSafeBase64Escape(absl::string_view src) {
914 std::string dest;
915 strings_internal::Base64EscapeInternal(
916 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
917 false, strings_internal::kWebSafeBase64Chars);
918 return dest;
919 }
920
HexStringToBytes(absl::string_view from)921 std::string HexStringToBytes(absl::string_view from) {
922 std::string result;
923 const auto num = from.size() / 2;
924 strings_internal::STLStringResizeUninitialized(&result, num);
925 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
926 return result;
927 }
928
BytesToHexString(absl::string_view from)929 std::string BytesToHexString(absl::string_view from) {
930 std::string result;
931 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
932 absl::BytesToHexStringInternal<std::string&>(
933 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
934 return result;
935 }
936
937 ABSL_NAMESPACE_END
938 } // namespace absl
939