1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "base/strings/escape.h"
11
12 #include <ostream>
13 #include <string_view>
14
15 #include "base/check_op.h"
16 #include "base/strings/string_number_conversions.h"
17 #include "base/strings/string_util.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 #include "base/strings/utf_string_conversions.h"
20 #include "base/third_party/icu/icu_utf.h"
21
22 namespace base {
23
24 namespace {
25
// A fast bit-vector map for ascii characters.
//
// Internally stores 256 bits in an array of 8 32-bit words: the bit for byte
// |c| is bit (c & 31) of word (c >> 5).
struct Charmap {
  // Returns true if the bit for |c| is set in the map.
  bool Contains(unsigned char c) const {
    // Shift an unsigned 32-bit one: shifting a plain (signed) int by 31 would
    // move a bit into the sign position.
    return (map[c >> 5] & (uint32_t{1} << (c & 31))) != 0;
  }

  uint32_t map[8];
};
37
38 // Given text to escape and a Charmap defining which values to escape,
39 // return an escaped string. If use_plus is true, spaces are converted
40 // to +, otherwise, if spaces are in the charmap, they are converted to
41 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
42 // '%' is in the charmap, it is converted to %25.
Escape(std::string_view text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)43 std::string Escape(std::string_view text,
44 const Charmap& charmap,
45 bool use_plus,
46 bool keep_escaped = false) {
47 std::string escaped;
48 escaped.reserve(text.length() * 3);
49 for (size_t i = 0; i < text.length(); ++i) {
50 unsigned char c = static_cast<unsigned char>(text[i]);
51 if (use_plus && ' ' == c) {
52 escaped.push_back('+');
53 } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
54 IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
55 escaped.push_back('%');
56 } else if (charmap.Contains(c)) {
57 escaped.push_back('%');
58 AppendHexEncodedByte(c, escaped);
59 } else {
60 escaped.push_back(static_cast<char>(c));
61 }
62 }
63 return escaped;
64 }
65
// Appends |c| to |output| in a form that will not be mistaken as HTML: the
// characters < > & " ' are replaced by the character references
// &lt; &gt; &amp; &quot; &#39; and every other character is appended as-is.
//
// |str| is the string type of |output| (narrow or wide); the replacement text
// is ASCII and is appended character by character so it works for both.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static constexpr struct {
    char key;
    std::string_view replacement;
  } kCharsToEscape[] = {
      {'<', "&lt;"},   {'>', "&gt;"},   {'&', "&amp;"},
      {'"', "&quot;"}, {'\'', "&#39;"},
  };
  for (const auto& char_to_escape : kCharsToEscape) {
    if (c == char_to_escape.key) {
      // Iterator-range append converts each ASCII char to |str|'s char type.
      output->append(char_to_escape.replacement.begin(),
                     char_to_escape.replacement.end());
      return;
    }
  }
  output->push_back(c);
}
85
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so that the result will not be interpreted
// as HTML. |T| is a string-like type; the result uses the same character type.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the common case in which nothing needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }

  return escaped;
}
98
// The Charmap constants below select which bytes Escape() percent-encodes.
// Following Charmap::Contains(), byte |c| is selected when bit (c & 31) of
// word (c >> 5) is set: word 0 covers the control characters 0x00-0x1F, words
// 1-3 cover 0x20-0x7F and words 4-7 cover the non-7bit bytes 0x80-0xFF.

// Everything except alphanumerics and -._~
// See RFC 3986 for the list of unreserved characters.
static const Charmap kUnreservedCharmap = {
    {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};

// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
                                      0xb8000001L, 0xffffffffL, 0xffffffffL,
                                      0xffffffffL, 0xffffffffL}};

#if BUILDFLAG(IS_APPLE)
// non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};
#endif  // BUILDFLAG(IS_APPLE)

// non-printable, non-7bit, and (including space)  "#$%&'+;<=>?@[\]^`{|}
// Note that the bitmap does not select '!' (bit 1 of word 1 is clear) and
// does select '@' (bit 0 of word 2 is set).
static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
                                    0xb8000001L, 0xffffffffL, 0xffffffffL,
                                    0xffffffffL, 0xffffffffL}};

// non-7bit, as well as %.
static const Charmap kNonASCIICharmapAndPercent = {
    {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// non-7bit
static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
                                          0x00000000L, 0xffffffffL, 0xffffffffL,
                                          0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = {
    {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};
143
// Lookup table indexed by ASCII code. An entry is nonzero when the
// corresponding character may be unescaped for normal URLs; a zero entry marks
// a character that may change the parsing of a URL, so it is only unescaped
// when the caller asks for it. In many cases we won't want to unescape spaces,
// but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
// clang-format off
const char kUrlUnescape[128] = {
//  Null, control chars...
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
//  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
//  @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
//  `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
183
184 // Attempts to unescape the sequence at |index| within |escaped_text|. If
185 // successful, sets |value| to the unescaped value. Returns whether
186 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(std::string_view escaped_text,size_t index,unsigned char * value)187 bool UnescapeUnsignedByteAtIndex(std::string_view escaped_text,
188 size_t index,
189 unsigned char* value) {
190 if ((index + 2) >= escaped_text.size())
191 return false;
192 if (escaped_text[index] != '%')
193 return false;
194 char most_sig_digit(escaped_text[index + 1]);
195 char least_sig_digit(escaped_text[index + 2]);
196 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
197 *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
198 HexDigitToInt(least_sig_digit));
199 return true;
200 }
201 return false;
202 }
203
204 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
205 // the specified index. On success, returns true, sets |code_point_out| to be
206 // the character's code point and |unescaped_out| to be the unescaped UTF-8
207 // string. |unescaped_out| will always be 1/3rd the length of the substring of
208 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(std::string_view escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)209 bool UnescapeUTF8CharacterAtIndex(std::string_view escaped_text,
210 size_t index,
211 base_icu::UChar32* code_point_out,
212 std::string* unescaped_out) {
213 DCHECK(unescaped_out->empty());
214
215 unsigned char bytes[CBU8_MAX_LENGTH];
216 if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
217 return false;
218
219 size_t num_bytes = 1;
220
221 // If this is a lead byte, need to collect trail bytes as well.
222 if (CBU8_IS_LEAD(bytes[0])) {
223 // Look for the last trail byte of the UTF-8 character. Give up once
224 // reach max character length number of bytes, or hit an unescaped
225 // character. No need to check length of escaped_text, as
226 // UnescapeUnsignedByteAtIndex checks lengths.
227 while (num_bytes < std::size(bytes) &&
228 UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
229 &bytes[num_bytes]) &&
230 CBU8_IS_TRAIL(bytes[num_bytes])) {
231 ++num_bytes;
232 }
233 }
234
235 size_t char_index = 0;
236 // Check if the unicode "character" that was just unescaped is valid.
237 if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
238 &char_index, code_point_out)) {
239 return false;
240 }
241
242 // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
243 // and the rest are not valid UTF-8, so need to update |num_bytes| based
244 // on the result of ReadUnicodeCharacter().
245 num_bytes = char_index + 1;
246 *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
247 return true;
248 }
249
// Takes a Unicode code point and returns true if it should be unescaped, based
// on |rules|.
//
// ASCII code points are decided by the kUrlUnescape table, widened by the
// SPACES, PATH_SEPARATORS and URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS flags
// in |rules|. All other code points are unescaped unless they appear in the
// deny-list below; |rules| is not consulted for them.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
                             base_icu::UChar32 code_point) {
  // If this is an ASCII character, use the lookup table.
  if (code_point >= 0 && code_point < 0x80) {
    return kUrlUnescape[static_cast<size_t>(code_point)] ||
           // Allow some additional unescaping when flags are set.
           (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
           // Path separators are only unescaped on explicit request.
           ((code_point == '/' || code_point == '\\') &&
            (rules & UnescapeRule::PATH_SEPARATORS)) ||
           // Allow any of the remaining prohibited characters above space
           // (path separators excluded) when doing "special" chars.
           (code_point > ' ' && code_point != '/' && code_point != '\\' &&
            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
  }

  // Compare the code point against a list of characters that can be used
  // to spoof other URLs.
  //
  // Can't use icu to make this cleaner, because Cronet cannot depend on
  // icu, and currently uses this file.
  // TODO(crbug.com/41381359): Try to make this use icu, both to
  // protect against regressions as the Unicode standard is updated and to
  // reduce the number of long lists of characters.
  return !(
      // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
      // control characters are not allowed to appear unescaped in URLs.
      code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
      code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
      code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
      code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
      code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
      code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
      code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)

      // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
      // since added some new BiDi control characters that are not safe to
      // unescape. http://www.unicode.org/reports/tr9
      code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
      code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
      code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
      code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
      code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)

      // The following spoofable characters are also banned in unescaped URLs,
      // because they could be used to imitate parts of a web browser's UI.
      code_point == 0x1F50F ||  // LOCK WITH INK PEN         (%F0%9F%94%8F)
      code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY      (%F0%9F%94%90)
      code_point == 0x1F512 ||  // LOCK                      (%F0%9F%94%92)
      code_point == 0x1F513 ||  // OPEN LOCK                 (%F0%9F%94%93)

      // Spaces are also banned, as they can be used to scroll text out of view.
      code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
      code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
      code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
      code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
      code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
      code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
      code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
      code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
      code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
      code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
      code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
      code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
      code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
      code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
      code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
      code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
      code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
      code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
      code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
      // U+2800 is rendered as a space, but is not considered whitespace (see
      // crbug.com/1068531).
      code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)

      // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
      // characters ([:Cf:]) are also banned (see crbug.com/824715).
      code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
      code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
      // Arabic number formatting
      (code_point >= 0x0600 && code_point <= 0x0605) ||
      // U+061C is already banned as a BiDi control character.
      code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
      code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
      code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
      code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
      code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
      code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
      code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
      code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
                               // (%E1%A0%8B)
      code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
                               // (%E1%A0%8C)
      code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
                               // (%E1%A0%8D)
      code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
      code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
      code_point == 0x200C ||  // ZERO WIDTH NON-JOINER       (%E2%80%8C)
      code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
      // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
      // BiDi control characters.
      code_point == 0x2060 ||  // WORD JOINER                 (%E2%81%A0)
      code_point == 0x2061 ||  // FUNCTION APPLICATION        (%E2%81%A1)
      code_point == 0x2062 ||  // INVISIBLE TIMES             (%E2%81%A2)
      code_point == 0x2063 ||  // INVISIBLE SEPARATOR         (%E2%81%A3)
      code_point == 0x2064 ||  // INVISIBLE PLUS              (%E2%81%A4)
      code_point == 0x2065 ||  // unassigned                  (%E2%81%A5)
      // U+2066--2069 are already banned as BiDi control characters.
      // General Punctuation - Deprecated (U+206A--206F)
      (code_point >= 0x206A && code_point <= 0x206F) ||
      code_point == 0x3164 ||  // HANGUL FILLER               (%E3%85%A4)
      (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // unassigned
      // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
      (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
      code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
      code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER   (%EF%BE%A0)
      code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
      code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
      code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
      code_point == 0x110BD ||  // KAITHI NUMBER SIGN        (%F0%91%82%BD)
      code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE  (%F0%91%83%8D)
      // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
      (code_point >= 0x13430 && code_point <= 0x13438) ||
      // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
      (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
      // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
      (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
      // Tags, Variation Selectors, and unassigned code points
      (code_point >= 0xE0000 && code_point <= 0xE0FFF));
}
381
382 // Unescapes |escaped_text| according to |rules|, returning the resulting
383 // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects
384 // the alterations done to the string that are not one-character-to-one-
385 // character. The resulting |adjustments| will always be sorted by increasing
386 // offset.
UnescapeURLWithAdjustmentsImpl(std::string_view escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)387 std::string UnescapeURLWithAdjustmentsImpl(
388 std::string_view escaped_text,
389 UnescapeRule::Type rules,
390 OffsetAdjuster::Adjustments* adjustments) {
391 if (adjustments)
392 adjustments->clear();
393 // Do not unescape anything, return the |escaped_text| text.
394 if (rules == UnescapeRule::NONE)
395 return std::string(escaped_text);
396
397 // The output of the unescaping is always smaller than the input, so we can
398 // reserve the input size to make sure we have enough buffer and don't have
399 // to allocate in the loop below.
400 std::string result;
401 result.reserve(escaped_text.length());
402
403 // Locations of adjusted text.
404 for (size_t i = 0, max = escaped_text.size(); i < max;) {
405 // Try to unescape the character.
406 base_icu::UChar32 code_point;
407 std::string unescaped;
408 if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
409 &unescaped)) {
410 // Check if the next character can be unescaped, but not as a valid UTF-8
411 // character. In that case, just unescaped and write the non-sense
412 // character.
413 //
414 // TODO(crbug.com/40570496): Do not unescape illegal UTF-8
415 // sequences.
416 unsigned char non_utf8_byte;
417 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
418 result.push_back(static_cast<char>(non_utf8_byte));
419 if (adjustments)
420 adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
421 i += 3;
422 continue;
423 }
424
425 // Character is not escaped, so append as is, unless it's a '+' and
426 // REPLACE_PLUS_WITH_SPACE is being applied.
427 if (escaped_text[i] == '+' &&
428 (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
429 result.push_back(' ');
430 } else {
431 result.push_back(escaped_text[i]);
432 }
433 ++i;
434 continue;
435 }
436
437 DCHECK(!unescaped.empty());
438
439 if (!ShouldUnescapeCodePoint(rules, code_point)) {
440 // If it's a valid UTF-8 character, but not safe to unescape, copy all
441 // bytes directly.
442 result.append(escaped_text.substr(i, 3 * unescaped.length()));
443 i += unescaped.length() * 3;
444 continue;
445 }
446
447 // If the code point is allowed, and append the entire unescaped character.
448 result.append(unescaped);
449 if (adjustments) {
450 for (size_t j = 0; j < unescaped.length(); ++j) {
451 adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
452 }
453 }
454 i += 3 * unescaped.length();
455 }
456
457 return result;
458 }
459
460 } // namespace
461
// Percent-escapes every byte of |text| that is selected by kUnreservedCharmap
// (everything except alphanumerics and -._~). Spaces are not turned into '+'.
std::string EscapeAllExceptUnreserved(std::string_view text) {
  return Escape(text, kUnreservedCharmap, /*use_plus=*/false);
}
465
// Percent-escapes every byte of |text| that is selected by kQueryCharmap
// (everything except alphanumerics and !'()*-._~). If |use_plus| is true,
// spaces are written as '+' instead.
std::string EscapeQueryParamValue(std::string_view text, bool use_plus) {
  return Escape(text, kQueryCharmap, use_plus);
}
469
// Percent-escapes every byte of |path| that is selected by kPathCharmap.
// Spaces are not turned into '+'.
std::string EscapePath(std::string_view path) {
  return Escape(path, kPathCharmap, /*use_plus=*/false);
}
473
474 #if BUILDFLAG(IS_APPLE)
// Percent-escapes every byte of |precursor| that is selected by
// kNSURLCharmap. Spaces are not turned into '+', and an existing "%XX"
// sequence is kept as it is rather than having its '%' escaped again.
std::string EscapeNSURLPrecursor(std::string_view precursor) {
  return Escape(precursor, kNSURLCharmap, /*use_plus=*/false,
                /*keep_escaped=*/true);
}
478 #endif // BUILDFLAG(IS_APPLE)
479
// Percent-escapes every byte of |path| that is selected by kUrlEscape. If
// |use_plus| is true, spaces are written as '+' instead.
std::string EscapeUrlEncodedData(std::string_view path, bool use_plus) {
  return Escape(path, kUrlEscape, use_plus);
}
483
// Percent-escapes the non-7bit bytes of |input| as well as '%' (the bytes
// selected by kNonASCIICharmapAndPercent). Spaces are not turned into '+'.
std::string EscapeNonASCIIAndPercent(std::string_view input) {
  return Escape(input, kNonASCIICharmapAndPercent, /*use_plus=*/false);
}
487
// Percent-escapes the non-7bit bytes of |input| (the bytes selected by
// kNonASCIICharmap). Spaces are not turned into '+'.
std::string EscapeNonASCII(std::string_view input) {
  return Escape(input, kNonASCIICharmap, /*use_plus=*/false);
}
491
// Percent-escapes every byte of |text| that is selected by
// kExternalHandlerCharmap. Spaces are not turned into '+', and an existing
// "%XX" sequence is kept as it is rather than having its '%' escaped again.
std::string EscapeExternalHandlerValue(std::string_view text) {
  return Escape(text, kExternalHandlerCharmap, /*use_plus=*/false,
                /*keep_escaped=*/true);
}
495
// Appends |c| to |output| in a form that will not be mistaken as HTML.
// Narrow-string entry point that forwards to AppendEscapedCharForHTMLImpl().
void AppendEscapedCharForHTML(char c, std::string* output) {
  AppendEscapedCharForHTMLImpl(c, output);
}
499
// Returns |input| converted to a form that will not be interpreted as HTML.
// Narrow-string overload; the work is done by EscapeForHTMLImpl().
std::string EscapeForHTML(std::string_view input) {
  return EscapeForHTMLImpl(input);
}
503
// Returns |input| converted to a form that will not be interpreted as HTML.
// UTF-16 overload; the work is done by EscapeForHTMLImpl().
std::u16string EscapeForHTML(std::u16string_view input) {
  return EscapeForHTMLImpl(input);
}
507
// Unescapes |escaped_text| according to |rules| and returns the result.
// Forwards to UnescapeURLWithAdjustmentsImpl() without requesting offset
// adjustments.
std::string UnescapeURLComponent(std::string_view escaped_text,
                                 UnescapeRule::Type rules) {
  return UnescapeURLWithAdjustmentsImpl(escaped_text, rules,
                                        /*adjustments=*/nullptr);
}
512
UnescapeAndDecodeUTF8URLComponentWithAdjustments(std::string_view text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)513 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
514 std::string_view text,
515 UnescapeRule::Type rules,
516 OffsetAdjuster::Adjustments* adjustments) {
517 std::u16string result;
518 OffsetAdjuster::Adjustments unescape_adjustments;
519 std::string unescaped_url(
520 UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
521 if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
522 &result, adjustments)) {
523 // Character set looks like it's valid.
524 if (adjustments) {
525 OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
526 adjustments);
527 }
528 return result;
529 }
530 // Character set is not valid. Return the escaped version.
531 return UTF8ToUTF16WithAdjustments(text, adjustments);
532 }
533
UnescapeBinaryURLComponent(std::string_view escaped_text,UnescapeRule::Type rules)534 std::string UnescapeBinaryURLComponent(std::string_view escaped_text,
535 UnescapeRule::Type rules) {
536 // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
537 DCHECK(rules != UnescapeRule::NONE);
538 DCHECK(!(rules &
539 ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
540
541 // If there are no '%' characters in the string, there will be nothing to
542 // unescape, so we can take the fast path.
543 if (escaped_text.find('%') == std::string_view::npos) {
544 std::string unescaped_text(escaped_text);
545 if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
546 std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
547 return unescaped_text;
548 }
549
550 std::string unescaped_text;
551
552 // The output of the unescaping is always smaller than the input, so we can
553 // reserve the input size to make sure we have enough buffer and don't have
554 // to allocate in the loop below.
555 // Increase capacity before size, as just resizing can grow capacity
556 // needlessly beyond our requested size.
557 unescaped_text.reserve(escaped_text.size());
558 unescaped_text.resize(escaped_text.size());
559
560 size_t output_index = 0;
561
562 for (size_t i = 0, max = escaped_text.size(); i < max;) {
563 unsigned char byte;
564 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
565 // to call.
566 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
567 unescaped_text[output_index++] = static_cast<char>(byte);
568 i += 3;
569 continue;
570 }
571
572 if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
573 escaped_text[i] == '+') {
574 unescaped_text[output_index++] = ' ';
575 ++i;
576 continue;
577 }
578
579 unescaped_text[output_index++] = escaped_text[i++];
580 }
581
582 DCHECK_LE(output_index, unescaped_text.size());
583 unescaped_text.resize(output_index);
584 return unescaped_text;
585 }
586
UnescapeBinaryURLComponentSafe(std::string_view escaped_text,bool fail_on_path_separators,std::string * unescaped_text)587 bool UnescapeBinaryURLComponentSafe(std::string_view escaped_text,
588 bool fail_on_path_separators,
589 std::string* unescaped_text) {
590 unescaped_text->clear();
591
592 std::set<unsigned char> illegal_encoded_bytes;
593 for (unsigned char c = '\x00'; c < '\x20'; ++c) {
594 illegal_encoded_bytes.insert(c);
595 }
596 if (fail_on_path_separators) {
597 illegal_encoded_bytes.insert('/');
598 illegal_encoded_bytes.insert('\\');
599 }
600 if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
601 return false;
602
603 *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
604 return true;
605 }
606
ContainsEncodedBytes(std::string_view escaped_text,const std::set<unsigned char> & bytes)607 bool ContainsEncodedBytes(std::string_view escaped_text,
608 const std::set<unsigned char>& bytes) {
609 for (size_t i = 0, max = escaped_text.size(); i < max;) {
610 unsigned char byte;
611 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
612 // to call.
613 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
614 if (bytes.find(byte) != bytes.end())
615 return true;
616
617 i += 3;
618 continue;
619 }
620
621 ++i;
622 }
623
624 return false;
625 }
626
// Returns a copy of |input| in which the character references &lt; &gt;
// &amp; &quot; and &#39; have been replaced by the characters they stand for
// (< > & " '). Any other text, including an '&' that does not start one of
// these references, is copied unchanged. The output of a replacement is never
// rescanned, so "&amp;lt;" becomes "&lt;" and not "<".
std::u16string UnescapeForHTML(std::u16string_view input) {
  static constexpr struct {
    std::u16string_view ampersand_code;
    char16_t replacement;
  } kEscapeToChars[] = {
      {u"&lt;", u'<'},   {u"&gt;", u'>'},   {u"&amp;", u'&'},
      {u"&quot;", u'"'}, {u"&#39;", u'\''},
  };

  // Fast path: without an ampersand there is nothing to unescape.
  if (input.find(u'&') == std::u16string_view::npos)
    return std::u16string(input);

  // Build the result in a single pass. The output is never longer than the
  // input, so one reservation is enough.
  std::u16string text;
  text.reserve(input.size());

  for (size_t i = 0; i < input.size();) {
    if (input[i] == u'&') {
      // Potential ampersand encode char. compare() only looks at the
      // characters at position |i|, so a miss does not scan the rest of the
      // string.
      bool replaced = false;
      for (const auto& entry : kEscapeToChars) {
        const size_t code_length = entry.ampersand_code.size();
        if (input.compare(i, code_length, entry.ampersand_code) == 0) {
          text.push_back(entry.replacement);
          i += code_length;
          replaced = true;
          break;
        }
      }
      if (replaced)
        continue;
    }
    text.push_back(input[i]);
    ++i;
  }
  return text;
}
662
663 } // namespace base
664