1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/escape.h"
6
7 #include <ostream>
8
9 #include "base/check_op.h"
10 #include "base/feature_list.h"
11 #include "base/features.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "base/third_party/icu/icu_utf.h"
17
18 namespace base {
19
20 namespace {
21
22 const char kHexString[] = "0123456789ABCDEF";
IntToHex(int i)23 inline char IntToHex(int i) {
24 DCHECK_GE(i, 0) << i << " not a hex value";
25 DCHECK_LE(i, 15) << i << " not a hex value";
26 return kHexString[i];
27 }
28
// A fast bit-vector map for 8-bit characters.
//
// Internally stores 256 bits in an array of eight 32-bit words: character |c|
// is represented by bit (c & 31) of word (c >> 5).
struct Charmap {
  // Returns true if the bit for |c| is set in the map.
  bool Contains(unsigned char c) const {
    // Shift an unsigned 32-bit one so that selecting bit 31 never shifts a
    // signed int into its sign bit.
    return (map[c >> 5] & (uint32_t{1} << (c & 31))) != 0;
  }

  uint32_t map[8];
};
40
41 // Given text to escape and a Charmap defining which values to escape,
42 // return an escaped string. If use_plus is true, spaces are converted
43 // to +, otherwise, if spaces are in the charmap, they are converted to
44 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
45 // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)46 std::string Escape(StringPiece text,
47 const Charmap& charmap,
48 bool use_plus,
49 bool keep_escaped = false) {
50 std::string escaped;
51 escaped.reserve(text.length() * 3);
52 for (size_t i = 0; i < text.length(); ++i) {
53 unsigned char c = static_cast<unsigned char>(text[i]);
54 if (use_plus && ' ' == c) {
55 escaped.push_back('+');
56 } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
57 IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
58 escaped.push_back('%');
59 } else if (charmap.Contains(c)) {
60 escaped.push_back('%');
61 escaped.push_back(IntToHex(c >> 4));
62 escaped.push_back(IntToHex(c & 0xf));
63 } else {
64 escaped.push_back(static_cast<char>(c));
65 }
66 }
67 return escaped;
68 }
69
// Appends |c| to |output| in a form that will not be mistaken as HTML.
//
// The five HTML-significant characters are replaced by character references:
//   <  ->  &lt;      >  ->  &gt;      &  ->  &amp;
//   "  ->  &quot;    '  ->  &#39;
// Every other character is appended unchanged. |str| may be any string type
// whose value_type can represent ASCII (e.g. std::string, std::u16string).
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* replacement = nullptr;
  switch (c) {
    case '<':
      replacement = "&lt;";
      break;
    case '>':
      replacement = "&gt;";
      break;
    case '&':
      replacement = "&amp;";
      break;
    case '"':
      replacement = "&quot;";
      break;
    case '\'':
      replacement = "&#39;";
      break;
    default:
      output->push_back(c);
      return;
  }
  // The replacements are pure ASCII, so widening each char to the output's
  // character type is lossless.
  for (const char* p = replacement; *p != '\0'; ++p) {
    output->push_back(static_cast<typename str::value_type>(*p));
  }
}
89
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so the result will not be interpreted as
// HTML. |T| is a string-like type; the result uses the same character type.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the common case in which nothing needs escaping.
  escaped.reserve(input.size());

  const auto last = input.end();
  for (auto it = input.begin(); it != last; ++it) {
    const CharT ch = *it;
    AppendEscapedCharForHTMLImpl(ch, &escaped);
  }

  return escaped;
}
102
103 // Everything except alphanumerics and -._~
104 // See RFC 3986 for the list of unreserved characters.
105 static const Charmap kUnreservedCharmap = {
106 {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
107 0xffffffffL, 0xffffffffL, 0xffffffffL}};
108
109 // Everything except alphanumerics and !'()*-._~
110 // See RFC 2396 for the list of reserved characters.
111 static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
112 0xb8000001L, 0xffffffffL, 0xffffffffL,
113 0xffffffffL, 0xffffffffL}};
114
115 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
116 static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
117 0xb8000001L, 0xffffffffL, 0xffffffffL,
118 0xffffffffL, 0xffffffffL}};
119
120 #if BUILDFLAG(IS_APPLE)
121 // non-printable, non-7bit, and (including space) "#%<>[\]^`{|}
122 static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
123 0xb8000001L, 0xffffffffL, 0xffffffffL,
124 0xffffffffL, 0xffffffffL}};
125 #endif // BUILDFLAG(IS_APPLE)
126
127 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
128 static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
129 0xb8000001L, 0xffffffffL, 0xffffffffL,
130 0xffffffffL, 0xffffffffL}};
131
132 // non-7bit, as well as %.
133 static const Charmap kNonASCIICharmapAndPercent = {
134 {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
135 0xffffffffL, 0xffffffffL, 0xffffffffL}};
136
137 // non-7bit
138 static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
139 0x00000000L, 0xffffffffL, 0xffffffffL,
140 0xffffffffL, 0xffffffffL}};
141
142 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
143 // !'()*-._~#[]
144 static const Charmap kExternalHandlerCharmap = {
145 {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
146 0xffffffffL, 0xffffffffL, 0xffffffffL}};
147
// Contains nonzero when the corresponding character is unescapable for normal
// URLs. These characters are the ones that may change the parsing of a URL, so
// we don't want to unescape them sometimes. In many cases we won't want to
// unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
//
// The table is indexed by ASCII code (0-127); each row below covers sixteen
// consecutive codes, and the comment above a row names the characters in it.
// clang-format off
const char kUrlUnescape[128] = {
  // Null, control chars...
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // ' ' ! " # $ % & ' ( ) * + , - . /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // @ A B C D E F G H I J K L M N O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // P Q R S T U V W X Y Z [ \ ] ^ _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  // ` a b c d e f g h i j k l m n o
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // p q r s t u v w x y z { | } ~ <DEL>
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
187
188 // Attempts to unescape the sequence at |index| within |escaped_text|. If
189 // successful, sets |value| to the unescaped value. Returns whether
190 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)191 bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
192 size_t index,
193 unsigned char* value) {
194 if ((index + 2) >= escaped_text.size())
195 return false;
196 if (escaped_text[index] != '%')
197 return false;
198 char most_sig_digit(escaped_text[index + 1]);
199 char least_sig_digit(escaped_text[index + 2]);
200 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
201 *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
202 HexDigitToInt(least_sig_digit));
203 return true;
204 }
205 return false;
206 }
207
208 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
209 // the specified index. On success, returns true, sets |code_point_out| to be
210 // the character's code point and |unescaped_out| to be the unescaped UTF-8
211 // string. |unescaped_out| will always be 1/3rd the length of the substring of
212 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)213 bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
214 size_t index,
215 base_icu::UChar32* code_point_out,
216 std::string* unescaped_out) {
217 DCHECK(unescaped_out->empty());
218
219 unsigned char bytes[CBU8_MAX_LENGTH];
220 if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
221 return false;
222
223 size_t num_bytes = 1;
224
225 // If this is a lead byte, need to collect trail bytes as well.
226 if (CBU8_IS_LEAD(bytes[0])) {
227 // Look for the last trail byte of the UTF-8 character. Give up once
228 // reach max character length number of bytes, or hit an unescaped
229 // character. No need to check length of escaped_text, as
230 // UnescapeUnsignedByteAtIndex checks lengths.
231 while (num_bytes < std::size(bytes) &&
232 UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
233 &bytes[num_bytes]) &&
234 CBU8_IS_TRAIL(bytes[num_bytes])) {
235 ++num_bytes;
236 }
237 }
238
239 size_t char_index = 0;
240 // Check if the unicode "character" that was just unescaped is valid.
241 if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
242 &char_index, code_point_out)) {
243 return false;
244 }
245
246 // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
247 // and the rest are not valid UTF-8, so need to update |num_bytes| based
248 // on the result of ReadUnicodeCharacter().
249 num_bytes = char_index + 1;
250 *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
251 return true;
252 }
253
// This method takes a Unicode code point and returns true if it should be
// unescaped, based on |rules|.
//
// ASCII code points (0x00-0x7F) are decided by the kUrlUnescape table, widened
// by the SPACES, PATH_SEPARATORS and URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS
// flags in |rules|. Every other code point is unescaped unless it appears in
// the deny-list below; |rules| is not consulted for those.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
                             base_icu::UChar32 code_point) {
  // If this is an ASCII character, use the lookup table.
  if (code_point >= 0 && code_point < 0x80) {
    return kUrlUnescape[static_cast<size_t>(code_point)] ||
           // Allow some additional unescaping when flags are set.
           (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
           // Allow any of the prohibited but non-control characters when doing
           // "special" chars.
           ((code_point == '/' || code_point == '\\') &&
            (rules & UnescapeRule::PATH_SEPARATORS)) ||
           (code_point > ' ' && code_point != '/' && code_point != '\\' &&
            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
  }

  // Compare the code point against a list of characters that can be used
  // to spoof other URLs.
  //
  // Can't use icu to make this cleaner, because Cronet cannot depend on
  // icu, and currently uses this file.
  // TODO(https://crbug.com/829873): Try to make this use icu, both to
  // protect against regressions as the Unicode standard is updated and to
  // reduce the number of long lists of characters.
  return !(
      // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
      // control characters are not allowed to appear unescaped in URLs.
      code_point == 0x200E ||  // LEFT-TO-RIGHT MARK (%E2%80%8E)
      code_point == 0x200F ||  // RIGHT-TO-LEFT MARK (%E2%80%8F)
      code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
      code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
      code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
      code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
      code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)

      // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
      // since added some new BiDi control characters that are not safe to
      // unescape. http://www.unicode.org/reports/tr9
      code_point == 0x061C ||  // ARABIC LETTER MARK (%D8%9C)
      code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
      code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
      code_point == 0x2068 ||  // FIRST STRONG ISOLATE (%E2%81%A8)
      code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE (%E2%81%A9)

      // The following spoofable characters are also banned in unescaped URLs,
      // because they could be used to imitate parts of a web browser's UI.
      code_point == 0x1F50F ||  // LOCK WITH INK PEN (%F0%9F%94%8F)
      code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
      code_point == 0x1F512 ||  // LOCK (%F0%9F%94%92)
      code_point == 0x1F513 ||  // OPEN LOCK (%F0%9F%94%93)

      // Spaces are also banned, as they can be used to scroll text out of view.
      code_point == 0x0085 ||  // NEXT LINE (%C2%85)
      code_point == 0x00A0 ||  // NO-BREAK SPACE (%C2%A0)
      code_point == 0x1680 ||  // OGHAM SPACE MARK (%E1%9A%80)
      code_point == 0x2000 ||  // EN QUAD (%E2%80%80)
      code_point == 0x2001 ||  // EM QUAD (%E2%80%81)
      code_point == 0x2002 ||  // EN SPACE (%E2%80%82)
      code_point == 0x2003 ||  // EM SPACE (%E2%80%83)
      code_point == 0x2004 ||  // THREE-PER-EM SPACE (%E2%80%84)
      code_point == 0x2005 ||  // FOUR-PER-EM SPACE (%E2%80%85)
      code_point == 0x2006 ||  // SIX-PER-EM SPACE (%E2%80%86)
      code_point == 0x2007 ||  // FIGURE SPACE (%E2%80%87)
      code_point == 0x2008 ||  // PUNCTUATION SPACE (%E2%80%88)
      code_point == 0x2009 ||  // THIN SPACE (%E2%80%89)
      code_point == 0x200A ||  // HAIR SPACE (%E2%80%8A)
      code_point == 0x2028 ||  // LINE SEPARATOR (%E2%80%A8)
      code_point == 0x2029 ||  // PARAGRAPH SEPARATOR (%E2%80%A9)
      code_point == 0x202F ||  // NARROW NO-BREAK SPACE (%E2%80%AF)
      code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE (%E2%81%9F)
      code_point == 0x3000 ||  // IDEOGRAPHIC SPACE (%E3%80%80)
      // U+2800 is rendered as a space, but is not considered whitespace (see
      // crbug.com/1068531).
      code_point == 0x2800 ||  // BRAILLE PATTERN BLANK (%E2%A0%80)

      // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
      // characters ([:Cf:]) are also banned (see crbug.com/824715).
      code_point == 0x00AD ||  // SOFT HYPHEN (%C2%AD)
      code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
      // Arabic number formatting
      (code_point >= 0x0600 && code_point <= 0x0605) ||
      // U+061C is already banned as a BiDi control character.
      code_point == 0x06DD ||  // ARABIC END OF AYAH (%DB%9D)
      code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK (%DC%8F)
      code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
      code_point == 0x115F ||  // HANGUL CHOSEONG FILLER (%E1%85%9F)
      code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER (%E1%85%A0)
      code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ (%E1%9E%B4)
      code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA (%E1%9E%B5)
      code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
                               // (%E1%A0%8B)
      code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
                               // (%E1%A0%8C)
      code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
                               // (%E1%A0%8D)
      code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR (%E1%A0%8E)
      code_point == 0x200B ||  // ZERO WIDTH SPACE (%E2%80%8B)
      code_point == 0x200C ||  // ZERO WIDTH NON-JOINER (%E2%80%8C)
      code_point == 0x200D ||  // ZERO WIDTH JOINER (%E2%80%8D)
      // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
      // BiDi control characters.
      code_point == 0x2060 ||  // WORD JOINER (%E2%81%A0)
      code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
      code_point == 0x2062 ||  // INVISIBLE TIMES (%E2%81%A2)
      code_point == 0x2063 ||  // INVISIBLE SEPARATOR (%E2%81%A3)
      code_point == 0x2064 ||  // INVISIBLE PLUS (%E2%81%A4)
      code_point == 0x2065 ||  // null (%E2%81%A5)
      // 0x2066--0x2069 are already banned as BiDi control characters.
      // General Punctuation - Deprecated (U+206A--206F)
      (code_point >= 0x206A && code_point <= 0x206F) ||
      code_point == 0x3164 ||                            // HANGUL FILLER (%E3%85%A4)
      (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
      // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
      (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
      code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
      code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
      code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR (%EF%BF%B9)
      code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR (%EF%BF%BA)
      code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
      code_point == 0x110BD ||  // KAITHI NUMBER SIGN (%F0%91%82%BD)
      code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
      // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
      (code_point >= 0x13430 && code_point <= 0x13438) ||
      // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
      (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
      // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
      (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
      // Tags, Variation Selectors, nulls
      (code_point >= 0xE0000 && code_point <= 0xE0FFF));
}
385
386 // Unescapes |escaped_text| according to |rules|, returning the resulting
387 // string. Fills in an |adjustments| parameter, if non-nullptr, so it reflects
388 // the alterations done to the string that are not one-character-to-one-
389 // character. The resulting |adjustments| will always be sorted by increasing
390 // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)391 std::string UnescapeURLWithAdjustmentsImpl(
392 StringPiece escaped_text,
393 UnescapeRule::Type rules,
394 OffsetAdjuster::Adjustments* adjustments) {
395 if (adjustments)
396 adjustments->clear();
397 // Do not unescape anything, return the |escaped_text| text.
398 if (rules == UnescapeRule::NONE)
399 return std::string(escaped_text);
400
401 // The output of the unescaping is always smaller than the input, so we can
402 // reserve the input size to make sure we have enough buffer and don't have
403 // to allocate in the loop below.
404 std::string result;
405 result.reserve(escaped_text.length());
406
407 // Locations of adjusted text.
408 for (size_t i = 0, max = escaped_text.size(); i < max;) {
409 // Try to unescape the character.
410 base_icu::UChar32 code_point;
411 std::string unescaped;
412 if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
413 &unescaped)) {
414 // Check if the next character can be unescaped, but not as a valid UTF-8
415 // character. In that case, just unescaped and write the non-sense
416 // character.
417 //
418 // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
419 // sequences.
420 unsigned char non_utf8_byte;
421 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
422 result.push_back(static_cast<char>(non_utf8_byte));
423 if (adjustments)
424 adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
425 i += 3;
426 continue;
427 }
428
429 // Character is not escaped, so append as is, unless it's a '+' and
430 // REPLACE_PLUS_WITH_SPACE is being applied.
431 if (escaped_text[i] == '+' &&
432 (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
433 result.push_back(' ');
434 } else {
435 result.push_back(escaped_text[i]);
436 }
437 ++i;
438 continue;
439 }
440
441 DCHECK(!unescaped.empty());
442
443 if (!ShouldUnescapeCodePoint(rules, code_point)) {
444 // If it's a valid UTF-8 character, but not safe to unescape, copy all
445 // bytes directly.
446 result.append(escaped_text.begin() + i,
447 escaped_text.begin() + i + 3 * unescaped.length());
448 i += unescaped.length() * 3;
449 continue;
450 }
451
452 // If the code point is allowed, and append the entire unescaped character.
453 result.append(unescaped);
454 if (adjustments) {
455 for (size_t j = 0; j < unescaped.length(); ++j) {
456 adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
457 }
458 }
459 i += 3 * unescaped.length();
460 }
461
462 return result;
463 }
464
465 } // namespace
466
// Percent-escapes every byte of |text| that is in kUnreservedCharmap. Spaces
// are never turned into '+'.
std::string EscapeAllExceptUnreserved(StringPiece text) {
  return Escape(text, kUnreservedCharmap, /*use_plus=*/false);
}
470
// Percent-escapes every byte of |text| that is in kQueryCharmap. When
// |use_plus| is true, spaces are written as '+' instead.
std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
  return Escape(text, kQueryCharmap, use_plus);
}
474
// Percent-escapes every byte of |path| that is in kPathCharmap. Spaces are
// never turned into '+'.
std::string EscapePath(StringPiece path) {
  return Escape(path, kPathCharmap, /*use_plus=*/false);
}
478
#if BUILDFLAG(IS_APPLE)
// Percent-escapes every byte of |precursor| that is in kNSURLCharmap, except
// that a '%' already followed by two hex digits is kept as is. Only compiled
// on Apple platforms.
std::string EscapeNSURLPrecursor(StringPiece precursor) {
  return Escape(precursor, kNSURLCharmap, /*use_plus=*/false,
                /*keep_escaped=*/true);
}
#endif  // BUILDFLAG(IS_APPLE)
484
// Percent-escapes every byte of |path| that is in kUrlEscape. When |use_plus|
// is true, spaces are written as '+' instead.
std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
  return Escape(path, kUrlEscape, use_plus);
}
488
// Percent-escapes every non-7bit byte of |input|, as well as '%' itself (the
// bytes in kNonASCIICharmapAndPercent).
std::string EscapeNonASCIIAndPercent(StringPiece input) {
  return Escape(input, kNonASCIICharmapAndPercent, /*use_plus=*/false);
}
492
// Percent-escapes every non-7bit byte of |input| (the bytes in
// kNonASCIICharmap); ASCII bytes are left unchanged.
std::string EscapeNonASCII(StringPiece input) {
  return Escape(input, kNonASCIICharmap, /*use_plus=*/false);
}
496
// Percent-escapes every byte of |text| that is in kExternalHandlerCharmap,
// except that a '%' already followed by two hex digits is kept as is.
std::string EscapeExternalHandlerValue(StringPiece text) {
  return Escape(text, kExternalHandlerCharmap, /*use_plus=*/false,
                /*keep_escaped=*/true);
}
500
// Appends |c| to |output|, HTML-escaping it if it is one of < > & " '.
// Forwards to AppendEscapedCharForHTMLImpl().
void AppendEscapedCharForHTML(char c, std::string* output) {
  AppendEscapedCharForHTMLImpl(c, output);
}
504
// Returns an HTML-escaped copy of the 8-bit string |input|. Forwards to
// EscapeForHTMLImpl().
std::string EscapeForHTML(StringPiece input) {
  return EscapeForHTMLImpl(input);
}
508
// Returns an HTML-escaped copy of the 16-bit string |input|. Forwards to
// EscapeForHTMLImpl().
std::u16string EscapeForHTML(StringPiece16 input) {
  return EscapeForHTMLImpl(input);
}
512
// Unescapes |escaped_text| according to |rules| and returns the result. Same
// as UnescapeURLWithAdjustmentsImpl() with no adjustments requested.
std::string UnescapeURLComponent(StringPiece escaped_text,
                                 UnescapeRule::Type rules) {
  return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
}
517
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)518 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
519 StringPiece text,
520 UnescapeRule::Type rules,
521 OffsetAdjuster::Adjustments* adjustments) {
522 std::u16string result;
523 OffsetAdjuster::Adjustments unescape_adjustments;
524 std::string unescaped_url(
525 UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
526 if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
527 &result, adjustments)) {
528 // Character set looks like it's valid.
529 if (adjustments) {
530 OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
531 adjustments);
532 }
533 return result;
534 }
535 // Character set is not valid. Return the escaped version.
536 return UTF8ToUTF16WithAdjustments(text, adjustments);
537 }
538
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)539 std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
540 UnescapeRule::Type rules) {
541 // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
542 DCHECK(rules != UnescapeRule::NONE);
543 DCHECK(!(rules &
544 ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
545
546 // It is not possible to read the feature state when this function is invoked
547 // before FeatureList initialization. In that case, fallback to the feature's
548 // default state.
549 //
550 // TODO(crbug.com/1321924): Cleanup this feature.
551 const bool optimize_data_urls_feature_is_enabled =
552 base::FeatureList::GetInstance()
553 ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
554 : features::kOptimizeDataUrls.default_state ==
555 base::FEATURE_ENABLED_BY_DEFAULT;
556
557 // If there are no '%' characters in the string, there will be nothing to
558 // unescape, so we can take the fast path.
559 if (optimize_data_urls_feature_is_enabled &&
560 escaped_text.find('%') == StringPiece::npos) {
561 std::string unescaped_text(escaped_text);
562 if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
563 std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
564 return unescaped_text;
565 }
566
567 std::string unescaped_text;
568
569 // The output of the unescaping is always smaller than the input, so we can
570 // reserve the input size to make sure we have enough buffer and don't have
571 // to allocate in the loop below.
572 // Increase capacity before size, as just resizing can grow capacity
573 // needlessly beyond our requested size.
574 unescaped_text.reserve(escaped_text.size());
575 unescaped_text.resize(escaped_text.size());
576
577 size_t output_index = 0;
578
579 for (size_t i = 0, max = escaped_text.size(); i < max;) {
580 unsigned char byte;
581 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
582 // to call.
583 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
584 unescaped_text[output_index++] = static_cast<char>(byte);
585 i += 3;
586 continue;
587 }
588
589 if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
590 escaped_text[i] == '+') {
591 unescaped_text[output_index++] = ' ';
592 ++i;
593 continue;
594 }
595
596 unescaped_text[output_index++] = escaped_text[i++];
597 }
598
599 DCHECK_LE(output_index, unescaped_text.size());
600 unescaped_text.resize(output_index);
601 return unescaped_text;
602 }
603
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)604 bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
605 bool fail_on_path_separators,
606 std::string* unescaped_text) {
607 unescaped_text->clear();
608
609 std::set<unsigned char> illegal_encoded_bytes;
610 for (unsigned char c = '\x00'; c < '\x20'; ++c) {
611 illegal_encoded_bytes.insert(c);
612 }
613 if (fail_on_path_separators) {
614 illegal_encoded_bytes.insert('/');
615 illegal_encoded_bytes.insert('\\');
616 }
617 if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
618 return false;
619
620 *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
621 return true;
622 }
623
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)624 bool ContainsEncodedBytes(StringPiece escaped_text,
625 const std::set<unsigned char>& bytes) {
626 for (size_t i = 0, max = escaped_text.size(); i < max;) {
627 unsigned char byte;
628 // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
629 // to call.
630 if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
631 if (bytes.find(byte) != bytes.end())
632 return true;
633
634 i += 3;
635 continue;
636 }
637
638 ++i;
639 }
640
641 return false;
642 }
643
UnescapeForHTML(StringPiece16 input)644 std::u16string UnescapeForHTML(StringPiece16 input) {
645 static const struct {
646 const char* ampersand_code;
647 const char16_t replacement;
648 } kEscapeToChars[] = {
649 {"<", '<'}, {">", '>'}, {"&", '&'},
650 {""", '"'}, {"'", '\''},
651 };
652 constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
653
654 if (input.find(u"&") == std::string::npos)
655 return std::u16string(input);
656
657 std::u16string ampersand_chars[kEscapeToCharsCount];
658 std::u16string text(input);
659 for (std::u16string::iterator iter = text.begin(); iter != text.end();
660 ++iter) {
661 if (*iter == '&') {
662 // Potential ampersand encode char.
663 size_t index = static_cast<size_t>(iter - text.begin());
664 for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
665 if (ampersand_chars[i].empty()) {
666 ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
667 }
668 if (text.find(ampersand_chars[i], index) == index) {
669 text.replace(
670 iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
671 1, kEscapeToChars[i].replacement);
672 break;
673 }
674 }
675 }
676 }
677 return text;
678 }
679
680 } // namespace base
681