1 // Copyright 2017 The Abseil Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "absl/strings/escaping.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 #include <limits>
23 #include <string>
24
25 #include "absl/base/config.h"
26 #include "absl/base/internal/raw_logging.h"
27 #include "absl/base/internal/unaligned_access.h"
28 #include "absl/strings/ascii.h"
29 #include "absl/strings/charset.h"
30 #include "absl/strings/internal/escaping.h"
31 #include "absl/strings/internal/resize_uninitialized.h"
32 #include "absl/strings/internal/utf8.h"
33 #include "absl/strings/numbers.h"
34 #include "absl/strings/str_cat.h"
35 #include "absl/strings/string_view.h"
36
37 namespace absl {
38 ABSL_NAMESPACE_BEGIN
39 namespace {
40
// Value passed as the leave_nulls_escaped argument of CUnescapeInternal() by
// CUnescape(): false means an escape sequence that decodes to NUL is written
// out as a zero byte like any other character instead of being left escaped.
constexpr bool kUnescapeNulls = false;
43
// Returns true iff `c` is one of the ASCII octal digits '0' through '7'.
inline bool is_octal_digit(char c) {
  if (c < '0') return false;
  return c <= '7';
}
45
hex_digit_to_int(char c)46 inline unsigned int hex_digit_to_int(char c) {
47 static_assert('0' == 0x30 && 'A' == 0x41 && 'a' == 0x61,
48 "Character set must be ASCII.");
49 assert(absl::ascii_isxdigit(static_cast<unsigned char>(c)));
50 unsigned int x = static_cast<unsigned char>(c);
51 if (x > '9') {
52 x += 9;
53 }
54 return x & 0xf;
55 }
56
IsSurrogate(char32_t c,absl::string_view src,std::string * error)57 inline bool IsSurrogate(char32_t c, absl::string_view src, std::string* error) {
58 if (c >= 0xD800 && c <= 0xDFFF) {
59 if (error) {
60 *error = absl::StrCat("invalid surrogate character (0xD800-DFFF): \\",
61 src);
62 }
63 return true;
64 }
65 return false;
66 }
67
68 // ----------------------------------------------------------------------
69 // CUnescapeInternal()
70 // Implements both CUnescape() and CUnescapeForNullTerminatedString().
71 //
72 // Unescapes C escape sequences and is the reverse of CEscape().
73 //
74 // If 'source' is valid, stores the unescaped string and its size in
75 // 'dest' and 'dest_len' respectively, and returns true. Otherwise
76 // returns false and optionally stores the error description in
77 // 'error'. Set 'error' to nullptr to disable error reporting.
78 //
79 // 'dest' should point to a buffer that is at least as big as 'source'.
80 // 'source' and 'dest' may be the same.
81 //
82 // NOTE: any changes to this function must also be reflected in the older
83 // UnescapeCEscapeSequences().
84 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,char * dest,ptrdiff_t * dest_len,std::string * error)85 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
86 char* dest, ptrdiff_t* dest_len, std::string* error) {
87 char* d = dest;
88 const char* p = source.data();
89 const char* end = p + source.size();
90 const char* last_byte = end - 1;
91
92 // Small optimization for case where source = dest and there's no escaping
93 while (p == d && p < end && *p != '\\') p++, d++;
94
95 while (p < end) {
96 if (*p != '\\') {
97 *d++ = *p++;
98 } else {
99 if (++p > last_byte) { // skip past the '\\'
100 if (error) *error = "String cannot end with \\";
101 return false;
102 }
103 switch (*p) {
104 case 'a': *d++ = '\a'; break;
105 case 'b': *d++ = '\b'; break;
106 case 'f': *d++ = '\f'; break;
107 case 'n': *d++ = '\n'; break;
108 case 'r': *d++ = '\r'; break;
109 case 't': *d++ = '\t'; break;
110 case 'v': *d++ = '\v'; break;
111 case '\\': *d++ = '\\'; break;
112 case '?': *d++ = '\?'; break; // \? Who knew?
113 case '\'': *d++ = '\''; break;
114 case '"': *d++ = '\"'; break;
115 case '0':
116 case '1':
117 case '2':
118 case '3':
119 case '4':
120 case '5':
121 case '6':
122 case '7': {
123 // octal digit: 1 to 3 digits
124 const char* octal_start = p;
125 unsigned int ch = static_cast<unsigned int>(*p - '0'); // digit 1
126 if (p < last_byte && is_octal_digit(p[1]))
127 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 2
128 if (p < last_byte && is_octal_digit(p[1]))
129 ch = ch * 8 + static_cast<unsigned int>(*++p - '0'); // digit 3
130 if (ch > 0xff) {
131 if (error) {
132 *error = "Value of \\" +
133 std::string(octal_start,
134 static_cast<size_t>(p + 1 - octal_start)) +
135 " exceeds 0xff";
136 }
137 return false;
138 }
139 if ((ch == 0) && leave_nulls_escaped) {
140 // Copy the escape sequence for the null character
141 const size_t octal_size = static_cast<size_t>(p + 1 - octal_start);
142 *d++ = '\\';
143 memmove(d, octal_start, octal_size);
144 d += octal_size;
145 break;
146 }
147 *d++ = static_cast<char>(ch);
148 break;
149 }
150 case 'x':
151 case 'X': {
152 if (p >= last_byte) {
153 if (error) *error = "String cannot end with \\x";
154 return false;
155 } else if (!absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
156 if (error) *error = "\\x cannot be followed by a non-hex digit";
157 return false;
158 }
159 unsigned int ch = 0;
160 const char* hex_start = p;
161 while (p < last_byte &&
162 absl::ascii_isxdigit(static_cast<unsigned char>(p[1])))
163 // Arbitrarily many hex digits
164 ch = (ch << 4) + hex_digit_to_int(*++p);
165 if (ch > 0xFF) {
166 if (error) {
167 *error = "Value of \\" +
168 std::string(hex_start,
169 static_cast<size_t>(p + 1 - hex_start)) +
170 " exceeds 0xff";
171 }
172 return false;
173 }
174 if ((ch == 0) && leave_nulls_escaped) {
175 // Copy the escape sequence for the null character
176 const size_t hex_size = static_cast<size_t>(p + 1 - hex_start);
177 *d++ = '\\';
178 memmove(d, hex_start, hex_size);
179 d += hex_size;
180 break;
181 }
182 *d++ = static_cast<char>(ch);
183 break;
184 }
185 case 'u': {
186 // \uhhhh => convert 4 hex digits to UTF-8
187 char32_t rune = 0;
188 const char* hex_start = p;
189 if (p + 4 >= end) {
190 if (error) {
191 *error = "\\u must be followed by 4 hex digits: \\" +
192 std::string(hex_start,
193 static_cast<size_t>(p + 1 - hex_start));
194 }
195 return false;
196 }
197 for (int i = 0; i < 4; ++i) {
198 // Look one char ahead.
199 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
200 rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
201 } else {
202 if (error) {
203 *error = "\\u must be followed by 4 hex digits: \\" +
204 std::string(hex_start,
205 static_cast<size_t>(p + 1 - hex_start));
206 }
207 return false;
208 }
209 }
210 if ((rune == 0) && leave_nulls_escaped) {
211 // Copy the escape sequence for the null character
212 *d++ = '\\';
213 memmove(d, hex_start, 5); // u0000
214 d += 5;
215 break;
216 }
217 if (IsSurrogate(rune, absl::string_view(hex_start, 5), error)) {
218 return false;
219 }
220 d += strings_internal::EncodeUTF8Char(d, rune);
221 break;
222 }
223 case 'U': {
224 // \Uhhhhhhhh => convert 8 hex digits to UTF-8
225 char32_t rune = 0;
226 const char* hex_start = p;
227 if (p + 8 >= end) {
228 if (error) {
229 *error = "\\U must be followed by 8 hex digits: \\" +
230 std::string(hex_start,
231 static_cast<size_t>(p + 1 - hex_start));
232 }
233 return false;
234 }
235 for (int i = 0; i < 8; ++i) {
236 // Look one char ahead.
237 if (absl::ascii_isxdigit(static_cast<unsigned char>(p[1]))) {
238 // Don't change rune until we're sure this
239 // is within the Unicode limit, but do advance p.
240 uint32_t newrune = (rune << 4) + hex_digit_to_int(*++p);
241 if (newrune > 0x10FFFF) {
242 if (error) {
243 *error = "Value of \\" +
244 std::string(hex_start,
245 static_cast<size_t>(p + 1 - hex_start)) +
246 " exceeds Unicode limit (0x10FFFF)";
247 }
248 return false;
249 } else {
250 rune = newrune;
251 }
252 } else {
253 if (error) {
254 *error = "\\U must be followed by 8 hex digits: \\" +
255 std::string(hex_start,
256 static_cast<size_t>(p + 1 - hex_start));
257 }
258 return false;
259 }
260 }
261 if ((rune == 0) && leave_nulls_escaped) {
262 // Copy the escape sequence for the null character
263 *d++ = '\\';
264 memmove(d, hex_start, 9); // U00000000
265 d += 9;
266 break;
267 }
268 if (IsSurrogate(rune, absl::string_view(hex_start, 9), error)) {
269 return false;
270 }
271 d += strings_internal::EncodeUTF8Char(d, rune);
272 break;
273 }
274 default: {
275 if (error) *error = std::string("Unknown escape sequence: \\") + *p;
276 return false;
277 }
278 }
279 p++; // read past letter we escaped
280 }
281 }
282 *dest_len = d - dest;
283 return true;
284 }
285
286 // ----------------------------------------------------------------------
287 // CUnescapeInternal()
288 //
289 // Same as above but uses a std::string for output. 'source' and 'dest'
290 // may be the same.
291 // ----------------------------------------------------------------------
CUnescapeInternal(absl::string_view source,bool leave_nulls_escaped,std::string * dest,std::string * error)292 bool CUnescapeInternal(absl::string_view source, bool leave_nulls_escaped,
293 std::string* dest, std::string* error) {
294 strings_internal::STLStringResizeUninitialized(dest, source.size());
295
296 ptrdiff_t dest_size;
297 if (!CUnescapeInternal(source,
298 leave_nulls_escaped,
299 &(*dest)[0],
300 &dest_size,
301 error)) {
302 return false;
303 }
304 dest->erase(static_cast<size_t>(dest_size));
305 return true;
306 }
307
308 // ----------------------------------------------------------------------
309 // CEscape()
310 // CHexEscape()
311 // Utf8SafeCEscape()
312 // Utf8SafeCHexEscape()
313 // Escapes 'src' using C-style escape sequences. This is useful for
314 // preparing query flags. The 'Hex' version uses hexadecimal rather than
315 // octal sequences. The 'Utf8Safe' version does not touch UTF-8 bytes.
316 //
317 // Escaped chars: \n, \r, \t, ", ', \, and !absl::ascii_isprint().
318 // ----------------------------------------------------------------------
CEscapeInternal(absl::string_view src,bool use_hex,bool utf8_safe)319 std::string CEscapeInternal(absl::string_view src, bool use_hex,
320 bool utf8_safe) {
321 std::string dest;
322 bool last_hex_escape = false; // true if last output char was \xNN.
323
324 for (char c : src) {
325 bool is_hex_escape = false;
326 switch (c) {
327 case '\n': dest.append("\\" "n"); break;
328 case '\r': dest.append("\\" "r"); break;
329 case '\t': dest.append("\\" "t"); break;
330 case '\"': dest.append("\\" "\""); break;
331 case '\'': dest.append("\\" "'"); break;
332 case '\\': dest.append("\\" "\\"); break;
333 default: {
334 // Note that if we emit \xNN and the src character after that is a hex
335 // digit then that digit must be escaped too to prevent it being
336 // interpreted as part of the character code by C.
337 const unsigned char uc = static_cast<unsigned char>(c);
338 if ((!utf8_safe || uc < 0x80) &&
339 (!absl::ascii_isprint(uc) ||
340 (last_hex_escape && absl::ascii_isxdigit(uc)))) {
341 if (use_hex) {
342 dest.append("\\" "x");
343 dest.push_back(numbers_internal::kHexChar[uc / 16]);
344 dest.push_back(numbers_internal::kHexChar[uc % 16]);
345 is_hex_escape = true;
346 } else {
347 dest.append("\\");
348 dest.push_back(numbers_internal::kHexChar[uc / 64]);
349 dest.push_back(numbers_internal::kHexChar[(uc % 64) / 8]);
350 dest.push_back(numbers_internal::kHexChar[uc % 8]);
351 }
352 } else {
353 dest.push_back(c);
354 break;
355 }
356 }
357 }
358 last_hex_escape = is_hex_escape;
359 }
360
361 return dest;
362 }
363
364 /* clang-format off */
// Number of output characters that CEscapeAndAppendInternal() emits for each
// input byte, indexed by the byte's value as an unsigned char:
//   1 - the byte is copied unchanged,
//   2 - the byte becomes a backslash plus one character (\t \n \r \" \' \\),
//   4 - the byte becomes a backslash plus three octal digits.
constexpr unsigned char c_escaped_len[256] = {
    4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 2, 4, 4, 2, 4, 4,  // \t, \n, \r
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,  // ", '
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // '0'..'9'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'A'..'O'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1,  // 'P'..'Z', '\'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  // 'a'..'o'
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4,  // 'p'..'z', DEL
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
};
383 /* clang-format on */
384
385 // Calculates the length of the C-style escaped version of 'src'.
386 // Assumes that non-printable characters are escaped using octal sequences, and
387 // that UTF-8 bytes are not handled specially.
CEscapedLength(absl::string_view src)388 inline size_t CEscapedLength(absl::string_view src) {
389 size_t escaped_len = 0;
390 for (char c : src)
391 escaped_len += c_escaped_len[static_cast<unsigned char>(c)];
392 return escaped_len;
393 }
394
CEscapeAndAppendInternal(absl::string_view src,std::string * dest)395 void CEscapeAndAppendInternal(absl::string_view src, std::string* dest) {
396 size_t escaped_len = CEscapedLength(src);
397 if (escaped_len == src.size()) {
398 dest->append(src.data(), src.size());
399 return;
400 }
401
402 size_t cur_dest_len = dest->size();
403 strings_internal::STLStringResizeUninitialized(dest,
404 cur_dest_len + escaped_len);
405 char* append_ptr = &(*dest)[cur_dest_len];
406
407 for (char c : src) {
408 size_t char_len = c_escaped_len[static_cast<unsigned char>(c)];
409 if (char_len == 1) {
410 *append_ptr++ = c;
411 } else if (char_len == 2) {
412 switch (c) {
413 case '\n':
414 *append_ptr++ = '\\';
415 *append_ptr++ = 'n';
416 break;
417 case '\r':
418 *append_ptr++ = '\\';
419 *append_ptr++ = 'r';
420 break;
421 case '\t':
422 *append_ptr++ = '\\';
423 *append_ptr++ = 't';
424 break;
425 case '\"':
426 *append_ptr++ = '\\';
427 *append_ptr++ = '\"';
428 break;
429 case '\'':
430 *append_ptr++ = '\\';
431 *append_ptr++ = '\'';
432 break;
433 case '\\':
434 *append_ptr++ = '\\';
435 *append_ptr++ = '\\';
436 break;
437 }
438 } else {
439 *append_ptr++ = '\\';
440 *append_ptr++ = '0' + static_cast<unsigned char>(c) / 64;
441 *append_ptr++ = '0' + (static_cast<unsigned char>(c) % 64) / 8;
442 *append_ptr++ = '0' + static_cast<unsigned char>(c) % 8;
443 }
444 }
445 }
446
447 // Reverses the mapping in Base64EscapeInternal; see that method's
448 // documentation for details of the mapping.
bool Base64UnescapeInternal(const char* src_param, size_t szsrc, char* dest,
                            size_t szdest, const signed char* unbase64,
                            size_t* len) {
  // Parameters (as used below):
  //   src_param, szsrc - the encoded input and its length in bytes.
  //   dest, szdest     - output buffer and its capacity. If dest is null the
  //                      input is only validated and the output length
  //                      counted; nothing is written.
  //   unbase64         - lookup table indexed by unsigned byte value; entries
  //                      are negative for bytes that are not data characters.
  //   len              - receives the decoded length on success; it is not
  //                      written when the function returns false.
  // Returns false on an illegal character, on insufficient space in dest, on
  // a dangling single data character, or on a wrong number of pad characters.
  static const char kPad64Equals = '=';
  static const char kPad64Dot = '.';

  size_t destidx = 0;     // number of output bytes produced so far
  int decode = 0;         // table value of the most recently read character
  int state = 0;          // data characters read in the current group of 4
  unsigned char ch = 0;   // most recently read input character
  unsigned int temp = 0;  // bit accumulator for the current group

  // If "char" is signed by default, using *src as an array index results in
  // accessing negative array elements. Treat the input as a pointer to
  // unsigned char to avoid this.
  const unsigned char* src = reinterpret_cast<const unsigned char*>(src_param);

  // The GET_INPUT macro gets the next input character, skipping
  // over any whitespace, and stopping when we reach the end of the
  // string or when we read any non-data character.  The arguments are
  // an arbitrary identifier (used as a label for goto) and the number
  // of data bytes that must remain in the input to avoid aborting the
  // loop.
#define GET_INPUT(label, remain)                                \
  label:                                                        \
  --szsrc;                                                      \
  ch = *src++;                                                  \
  decode = unbase64[ch];                                        \
  if (decode < 0) {                                             \
    if (absl::ascii_isspace(ch) && szsrc >= remain) goto label; \
    state = 4 - remain;                                         \
    break;                                                      \
  }

  // if dest is null, we're just checking to see if it's legal input
  // rather than producing output.  (I suspect this could just be done
  // with a regexp...).  We duplicate the loop so this test can be
  // outside it instead of in every iteration.

  if (dest) {
    // This loop consumes 4 input bytes and produces 3 output bytes
    // per iteration.  We can't know at the start that there is enough
    // data left in the string for a full iteration, so the loop may
    // break out in the middle; if so 'state' will be set to the
    // number of input bytes read.

    while (szsrc >= 4) {
      // We'll start by optimistically assuming that the next four
      // bytes of the string (src[0..3]) are four good data bytes
      // (that is, no nulls, whitespace, padding chars, or illegal
      // chars).  We need to test src[0..2] for nulls individually
      // before constructing temp to preserve the property that we
      // never read past a null in the string (no matter how long
      // szsrc claims the string is).

      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        // Iff any of those four characters was bad (null, illegal,
        // whitespace, padding), then temp's high bit will be set
        // (because unbase64[] is -1 for all bad characters).
        //
        // We'll back up and resort to the slower decoder, which knows
        // how to handle those cases.

        GET_INPUT(first, 4);
        temp = static_cast<unsigned char>(decode);
        GET_INPUT(second, 3);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(third, 2);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
        GET_INPUT(fourth, 1);
        temp = (temp << 6) | static_cast<unsigned char>(decode);
      } else {
        // We really did have four good data bytes, so advance four
        // characters in the string.

        szsrc -= 4;
        src += 4;
      }

      // temp has 24 bits of input, so write that out as three bytes.

      if (destidx + 3 > szdest) return false;
      dest[destidx + 2] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx + 1] = static_cast<char>(temp);
      temp >>= 8;
      dest[destidx] = static_cast<char>(temp);
      destidx += 3;
    }
  } else {
    // Validation-only copy of the loop above: same input handling, but the
    // decoded bits are discarded and only the output length is counted.
    while (szsrc >= 4) {
      if (!src[0] || !src[1] || !src[2] ||
          ((temp = ((unsigned(unbase64[src[0]]) << 18) |
                    (unsigned(unbase64[src[1]]) << 12) |
                    (unsigned(unbase64[src[2]]) << 6) |
                    (unsigned(unbase64[src[3]])))) &
           0x80000000)) {
        GET_INPUT(first_no_dest, 4);
        GET_INPUT(second_no_dest, 3);
        GET_INPUT(third_no_dest, 2);
        GET_INPUT(fourth_no_dest, 1);
      } else {
        szsrc -= 4;
        src += 4;
      }
      destidx += 3;
    }
  }

#undef GET_INPUT

  // if the loop terminated because we read a bad character, return
  // now.
  if (decode < 0 && ch != kPad64Equals && ch != kPad64Dot &&
      !absl::ascii_isspace(ch))
    return false;

  if (ch == kPad64Equals || ch == kPad64Dot) {
    // if we stopped by hitting an '=' or '.', un-read that character -- we'll
    // look at it again when we count to check for the proper number of
    // equals signs at the end.
    ++szsrc;
    --src;
  } else {
    // This loop consumes 1 input byte per iteration.  It's used to
    // clean up the 0-3 input bytes remaining when the first, faster
    // loop finishes.  'temp' contains the data from 'state' input
    // characters read by the first loop.
    while (szsrc > 0) {
      --szsrc;
      ch = *src++;
      decode = unbase64[ch];
      if (decode < 0) {
        if (absl::ascii_isspace(ch)) {
          continue;
        } else if (ch == kPad64Equals || ch == kPad64Dot) {
          // back up one character; we'll read it again when we check
          // for the correct number of pad characters at the end.
          ++szsrc;
          --src;
          break;
        } else {
          return false;
        }
      }

      // Each input character gives us six bits of output.
      temp = (temp << 6) | static_cast<unsigned char>(decode);
      ++state;
      if (state == 4) {
        // If we've accumulated 24 bits of output, write that out as
        // three bytes.
        if (dest) {
          if (destidx + 3 > szdest) return false;
          dest[destidx + 2] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx + 1] = static_cast<char>(temp);
          temp >>= 8;
          dest[destidx] = static_cast<char>(temp);
        }
        destidx += 3;
        state = 0;
        temp = 0;
      }
    }
  }

  // Process the leftover data contained in 'temp' at the end of the input.
  int expected_equals = 0;
  switch (state) {
    case 0:
      // Nothing left over; output is a multiple of 3 bytes.
      break;

    case 1:
      // Bad input; we have 6 bits left over.
      return false;

    case 2:
      // Produce one more output byte from the 12 input bits we have left.
      if (dest) {
        if (destidx + 1 > szdest) return false;
        temp >>= 4;
        dest[destidx] = static_cast<char>(temp);
      }
      ++destidx;
      expected_equals = 2;
      break;

    case 3:
      // Produce two more output bytes from the 18 input bits we have left.
      if (dest) {
        if (destidx + 2 > szdest) return false;
        temp >>= 2;
        dest[destidx + 1] = static_cast<char>(temp);
        temp >>= 8;
        dest[destidx] = static_cast<char>(temp);
      }
      destidx += 2;
      expected_equals = 1;
      break;

    default:
      // state should have no other values at this point.
      ABSL_RAW_LOG(FATAL, "This can't happen; base64 decoder state = %d",
                   state);
  }

  // The remainder of the string should be all whitespace, mixed with
  // exactly 0 equals signs, or exactly 'expected_equals' equals
  // signs.  (Always accepting 0 equals signs is an Abseil extension
  // not covered in the RFC, as is accepting dot as the pad character.)

  int equals = 0;
  while (szsrc > 0) {
    if (*src == kPad64Equals || *src == kPad64Dot)
      ++equals;
    else if (!absl::ascii_isspace(*src))
      return false;
    --szsrc;
    ++src;
  }

  const bool ok = (equals == 0 || equals == expected_equals);
  if (ok) *len = destidx;
  return ok;
}
681
682 // The arrays below map base64-escaped characters back to their original values.
683 // For the inverse case, see k(WebSafe)Base64Chars in the internal
684 // escaping.cc.
685 // These arrays were generated by the following inversion code:
686 // #include <sys/time.h>
687 // #include <stdlib.h>
688 // #include <string.h>
689 // main()
690 // {
691 // static const char Base64[] =
692 // "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
693 // char* pos;
694 // int idx, i, j;
695 // printf(" ");
696 // for (i = 0; i < 255; i += 8) {
697 // for (j = i; j < i + 8; j++) {
698 // pos = strchr(Base64, j);
699 // if ((pos == nullptr) || (j == 0))
700 // idx = -1;
701 // else
702 // idx = pos - Base64;
703 // if (idx == -1)
704 // printf(" %2d, ", idx);
705 // else
706 // printf(" %2d/*%c*/,", idx, j);
707 // }
708 // printf("\n ");
709 // }
710 // }
711 //
712 // where the value of "Base64[]" was replaced by one of k(WebSafe)Base64Chars
713 // in the internal escaping.cc.
714 /* clang-format off */
// Decoding table for the standard alphabet ('+' and '/' for values 62 and
// 63). Indexed by byte value; yields the character's 6-bit value, or -1 for
// any byte that is not a data character.
constexpr signed char kUnBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      62/*+*/, -1,      -1,      -1,      63/*/ */,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      -1,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
749
// Decoding table for the web-safe alphabet ('-' and '_' for values 62 and
// 63). Indexed by byte value; yields the character's 6-bit value, or -1 for
// any byte that is not a data character.
constexpr signed char kUnWebSafeBase64[] = {
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      62/*-*/, -1,      -1,
    52/*0*/, 53/*1*/, 54/*2*/, 55/*3*/, 56/*4*/, 57/*5*/, 58/*6*/, 59/*7*/,
    60/*8*/, 61/*9*/, -1,      -1,      -1,      -1,      -1,      -1,
    -1,       0/*A*/,  1/*B*/,  2/*C*/,  3/*D*/,  4/*E*/,  5/*F*/,  6/*G*/,
    07/*H*/,  8/*I*/,  9/*J*/, 10/*K*/, 11/*L*/, 12/*M*/, 13/*N*/, 14/*O*/,
    15/*P*/, 16/*Q*/, 17/*R*/, 18/*S*/, 19/*T*/, 20/*U*/, 21/*V*/, 22/*W*/,
    23/*X*/, 24/*Y*/, 25/*Z*/, -1,      -1,      -1,      -1,      63/*_*/,
    -1,      26/*a*/, 27/*b*/, 28/*c*/, 29/*d*/, 30/*e*/, 31/*f*/, 32/*g*/,
    33/*h*/, 34/*i*/, 35/*j*/, 36/*k*/, 37/*l*/, 38/*m*/, 39/*n*/, 40/*o*/,
    41/*p*/, 42/*q*/, 43/*r*/, 44/*s*/, 45/*t*/, 46/*u*/, 47/*v*/, 48/*w*/,
    49/*x*/, 50/*y*/, 51/*z*/, -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1,
    -1,      -1,      -1,      -1,      -1,      -1,      -1,      -1
};
784 /* clang-format on */
785
786 template <typename String>
Base64UnescapeInternal(const char * src,size_t slen,String * dest,const signed char * unbase64)787 bool Base64UnescapeInternal(const char* src, size_t slen, String* dest,
788 const signed char* unbase64) {
789 // Determine the size of the output string. Base64 encodes every 3 bytes into
790 // 4 characters. Any leftover chars are added directly for good measure.
791 const size_t dest_len = 3 * (slen / 4) + (slen % 4);
792
793 strings_internal::STLStringResizeUninitialized(dest, dest_len);
794
795 // We are getting the destination buffer by getting the beginning of the
796 // string and converting it into a char *.
797 size_t len;
798 const bool ok =
799 Base64UnescapeInternal(src, slen, &(*dest)[0], dest_len, unbase64, &len);
800 if (!ok) {
801 dest->clear();
802 return false;
803 }
804
805 // could be shorter if there was padding
806 assert(len <= dest_len);
807 dest->erase(len);
808
809 return true;
810 }
811
812 /* clang-format off */
// Maps a byte value to the numeric value of the hex digit it represents.
// "Lenient": bytes that are not hex digits map to 0 rather than to an error
// marker, so lookups never fail.
constexpr char kHexValueLenient[256] = {
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0,  // '0'..'9'
    0,  10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'A'..'F'
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // 'a'..'f'
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0,
};
831
832 /* clang-format on */
833
834 // This is a templated function so that T can be either a char*
835 // or a string. This works because we use the [] operator to access
836 // individual characters at a time.
837 template <typename T>
HexStringToBytesInternal(const char * from,T to,size_t num)838 void HexStringToBytesInternal(const char* from, T to, size_t num) {
839 for (size_t i = 0; i < num; i++) {
840 to[i] = static_cast<char>(kHexValueLenient[from[i * 2] & 0xFF] << 4) +
841 (kHexValueLenient[from[i * 2 + 1] & 0xFF]);
842 }
843 }
844
845 // This is a templated function so that T can be either a char* or a
846 // std::string.
847 template <typename T>
BytesToHexStringInternal(const unsigned char * src,T dest,size_t num)848 void BytesToHexStringInternal(const unsigned char* src, T dest, size_t num) {
849 auto dest_ptr = &dest[0];
850 for (auto src_ptr = src; src_ptr != (src + num); ++src_ptr, dest_ptr += 2) {
851 const char* hex_p = &numbers_internal::kHexTable[*src_ptr * 2];
852 std::copy(hex_p, hex_p + 2, dest_ptr);
853 }
854 }
855
856 } // namespace
857
858 // ----------------------------------------------------------------------
859 // CUnescape()
860 //
861 // See CUnescapeInternal() for implementation details.
862 // ----------------------------------------------------------------------
CUnescape(absl::string_view source,std::string * dest,std::string * error)863 bool CUnescape(absl::string_view source, std::string* dest,
864 std::string* error) {
865 return CUnescapeInternal(source, kUnescapeNulls, dest, error);
866 }
867
CEscape(absl::string_view src)868 std::string CEscape(absl::string_view src) {
869 std::string dest;
870 CEscapeAndAppendInternal(src, &dest);
871 return dest;
872 }
873
CHexEscape(absl::string_view src)874 std::string CHexEscape(absl::string_view src) {
875 return CEscapeInternal(src, true, false);
876 }
877
Utf8SafeCEscape(absl::string_view src)878 std::string Utf8SafeCEscape(absl::string_view src) {
879 return CEscapeInternal(src, false, true);
880 }
881
Utf8SafeCHexEscape(absl::string_view src)882 std::string Utf8SafeCHexEscape(absl::string_view src) {
883 return CEscapeInternal(src, true, true);
884 }
885
Base64Unescape(absl::string_view src,std::string * dest)886 bool Base64Unescape(absl::string_view src, std::string* dest) {
887 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnBase64);
888 }
889
WebSafeBase64Unescape(absl::string_view src,std::string * dest)890 bool WebSafeBase64Unescape(absl::string_view src, std::string* dest) {
891 return Base64UnescapeInternal(src.data(), src.size(), dest, kUnWebSafeBase64);
892 }
893
Base64Escape(absl::string_view src,std::string * dest)894 void Base64Escape(absl::string_view src, std::string* dest) {
895 strings_internal::Base64EscapeInternal(
896 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
897 true, strings_internal::kBase64Chars);
898 }
899
WebSafeBase64Escape(absl::string_view src,std::string * dest)900 void WebSafeBase64Escape(absl::string_view src, std::string* dest) {
901 strings_internal::Base64EscapeInternal(
902 reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
903 false, strings_internal::kWebSafeBase64Chars);
904 }
905
Base64Escape(absl::string_view src)906 std::string Base64Escape(absl::string_view src) {
907 std::string dest;
908 strings_internal::Base64EscapeInternal(
909 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
910 true, strings_internal::kBase64Chars);
911 return dest;
912 }
913
WebSafeBase64Escape(absl::string_view src)914 std::string WebSafeBase64Escape(absl::string_view src) {
915 std::string dest;
916 strings_internal::Base64EscapeInternal(
917 reinterpret_cast<const unsigned char*>(src.data()), src.size(), &dest,
918 false, strings_internal::kWebSafeBase64Chars);
919 return dest;
920 }
921
HexStringToBytes(absl::string_view from)922 std::string HexStringToBytes(absl::string_view from) {
923 std::string result;
924 const auto num = from.size() / 2;
925 strings_internal::STLStringResizeUninitialized(&result, num);
926 absl::HexStringToBytesInternal<std::string&>(from.data(), result, num);
927 return result;
928 }
929
BytesToHexString(absl::string_view from)930 std::string BytesToHexString(absl::string_view from) {
931 std::string result;
932 strings_internal::STLStringResizeUninitialized(&result, 2 * from.size());
933 absl::BytesToHexStringInternal<std::string&>(
934 reinterpret_cast<const unsigned char*>(from.data()), result, from.size());
935 return result;
936 }
937
938 ABSL_NAMESPACE_END
939 } // namespace absl
940