1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/escape.h"
6 
7 #include <ostream>
8 
9 #include "base/check_op.h"
10 #include "base/feature_list.h"
11 #include "base/features.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "base/third_party/icu/icu_utf.h"
17 
18 namespace base {
19 
20 namespace {
21 
22 const char kHexString[] = "0123456789ABCDEF";
IntToHex(int i)23 inline char IntToHex(int i) {
24   DCHECK_GE(i, 0) << i << " not a hex value";
25   DCHECK_LE(i, 15) << i << " not a hex value";
26   return kHexString[i];
27 }
28 
// A fast bit-vector map for 8-bit characters.
//
// Stores 256 bits in an array of eight 32-bit words. The bit for character
// |c| is bit (c & 31) of word (c >> 5). The struct is a plain aggregate so
// that tables can be brace-initialized.
struct Charmap {
  // Returns true if the bit for |c| is set in the map.
  bool Contains(unsigned char c) const {
    // Build the mask from an unsigned one so that selecting bit 31 never
    // shifts into the sign bit of a signed int.
    const uint32_t mask = uint32_t{1} << (c & 31);
    return (map[c >> 5] & mask) != 0;
  }

  uint32_t map[8];
};
40 
41 // Given text to escape and a Charmap defining which values to escape,
42 // return an escaped string.  If use_plus is true, spaces are converted
43 // to +, otherwise, if spaces are in the charmap, they are converted to
44 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
45 // '%' is in the charmap, it is converted to %25.
Escape(StringPiece text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)46 std::string Escape(StringPiece text,
47                    const Charmap& charmap,
48                    bool use_plus,
49                    bool keep_escaped = false) {
50   std::string escaped;
51   escaped.reserve(text.length() * 3);
52   for (size_t i = 0; i < text.length(); ++i) {
53     unsigned char c = static_cast<unsigned char>(text[i]);
54     if (use_plus && ' ' == c) {
55       escaped.push_back('+');
56     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
57                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
58       escaped.push_back('%');
59     } else if (charmap.Contains(c)) {
60       escaped.push_back('%');
61       escaped.push_back(IntToHex(c >> 4));
62       escaped.push_back(IntToHex(c & 0xf));
63     } else {
64       escaped.push_back(static_cast<char>(c));
65     }
66   }
67   return escaped;
68 }
69 
// Appends |c| to |output| in a form that will not be mistaken for HTML: the
// characters < > & " ' are replaced by the entities &lt; &gt; &amp; &quot;
// &#39; respectively, and every other character is appended unchanged.
//
// |str| is the output string type (e.g. std::string or std::u16string); the
// ASCII replacement text is widened to |str::value_type| as it is appended.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static constexpr struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
      {'<', "&lt;"},   {'>', "&gt;"},   {'&', "&amp;"},
      {'"', "&quot;"}, {'\'', "&#39;"},
  };
  for (const auto& char_to_escape : kCharsToEscape) {
    if (c == char_to_escape.key) {
      for (const char* p = char_to_escape.replacement; *p != '\0'; ++p) {
        output->push_back(static_cast<typename str::value_type>(*p));
      }
      return;
    }
  }
  // Not an HTML-significant character; copy it through untouched.
  output->push_back(c);
}
89 
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so the result will not be interpreted as
// HTML. |T| is the input string(-view) type; the result uses its character
// type.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Sized for the common case in which nothing needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it)
    AppendEscapedCharForHTMLImpl(*it, &escaped);

  return escaped;
}
102 
// The Charmap tables below select the characters that Escape() rewrites as
// %XX. Character |c| is selected when bit (c & 31) of word (c >> 5) is set
// (see Charmap::Contains()), so word 0 covers 0x00-0x1F, word 1 covers
// 0x20-0x3F, word 2 covers 0x40-0x5F, word 3 covers 0x60-0x7F and words 4-7
// cover the non-7bit range 0x80-0xFF.

// Everything except alphanumerics and -._~
// See RFC 3986 for the list of unreserved characters.
static const Charmap kUnreservedCharmap = {
    {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};

// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
                                      0xb8000001L, 0xffffffffL, 0xffffffffL,
                                      0xffffffffL, 0xffffffffL}};

#if BUILDFLAG(IS_APPLE)
// non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
                                       0xffffffffL, 0xffffffffL}};
#endif  // BUILDFLAG(IS_APPLE)

// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
                                    0xb8000001L, 0xffffffffL, 0xffffffffL,
                                    0xffffffffL, 0xffffffffL}};

// non-7bit, as well as %.
// (0x20 in word 1 is bit 5, i.e. character 0x25 '%'.)
static const Charmap kNonASCIICharmapAndPercent = {
    {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};

// non-7bit
static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
                                          0x00000000L, 0xffffffffL, 0xffffffffL,
                                          0xffffffffL, 0xffffffffL}};

// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = {
    {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
     0xffffffffL, 0xffffffffL, 0xffffffffL}};
147 
// Lookup table indexed by ASCII code (0-127). An entry is nonzero when the
// corresponding character may be unescaped in normal URLs and zero when it
// must stay escaped. The zero entries are the characters that may change the
// parsing of a URL, so we don't want to unescape them sometimes. In many
// cases we won't want to unescape spaces, but that is controlled by
// parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc.  Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
// clang-format off
const char kUrlUnescape[128] = {
//   Null, control chars...
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
//   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
//   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
//   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
};
// clang-format on
187 
188 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
189 // successful, sets |value| to the unescaped value.  Returns whether
190 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(StringPiece escaped_text,size_t index,unsigned char * value)191 bool UnescapeUnsignedByteAtIndex(StringPiece escaped_text,
192                                  size_t index,
193                                  unsigned char* value) {
194   if ((index + 2) >= escaped_text.size())
195     return false;
196   if (escaped_text[index] != '%')
197     return false;
198   char most_sig_digit(escaped_text[index + 1]);
199   char least_sig_digit(escaped_text[index + 2]);
200   if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
201     *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
202                                         HexDigitToInt(least_sig_digit));
203     return true;
204   }
205   return false;
206 }
207 
208 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
209 // the specified index. On success, returns true, sets |code_point_out| to be
210 // the character's code point and |unescaped_out| to be the unescaped UTF-8
211 // string. |unescaped_out| will always be 1/3rd the length of the substring of
212 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)213 bool UnescapeUTF8CharacterAtIndex(StringPiece escaped_text,
214                                   size_t index,
215                                   base_icu::UChar32* code_point_out,
216                                   std::string* unescaped_out) {
217   DCHECK(unescaped_out->empty());
218 
219   unsigned char bytes[CBU8_MAX_LENGTH];
220   if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
221     return false;
222 
223   size_t num_bytes = 1;
224 
225   // If this is a lead byte, need to collect trail bytes as well.
226   if (CBU8_IS_LEAD(bytes[0])) {
227     // Look for the last trail byte of the UTF-8 character.  Give up once
228     // reach max character length number of bytes, or hit an unescaped
229     // character. No need to check length of escaped_text, as
230     // UnescapeUnsignedByteAtIndex checks lengths.
231     while (num_bytes < std::size(bytes) &&
232            UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
233                                        &bytes[num_bytes]) &&
234            CBU8_IS_TRAIL(bytes[num_bytes])) {
235       ++num_bytes;
236     }
237   }
238 
239   size_t char_index = 0;
240   // Check if the unicode "character" that was just unescaped is valid.
241   if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
242                             &char_index, code_point_out)) {
243     return false;
244   }
245 
246   // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
247   // and the rest are not valid UTF-8, so need to update |num_bytes| based
248   // on the result of ReadUnicodeCharacter().
249   num_bytes = char_index + 1;
250   *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
251   return true;
252 }
253 
// This method takes a Unicode code point and returns true if it should be
// unescaped, based on |rules|.
//
// ASCII code points (below 0x80) are allowed when kUrlUnescape marks them as
// unescapable, or when one of the SPACES, PATH_SEPARATORS or
// URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS bits in |rules| covers them.
// Any other code point is allowed unless it appears in the block list below;
// |rules| is not consulted for those.
bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
                             base_icu::UChar32 code_point) {
  // If this is an ASCII character, use the lookup table.
  if (code_point >= 0 && code_point < 0x80) {
    return kUrlUnescape[static_cast<size_t>(code_point)] ||
           // Allow some additional unescaping when flags are set.
           (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
           // Allow any of the prohibited but non-control characters when doing
           // "special" chars.
           ((code_point == '/' || code_point == '\\') &&
            (rules & UnescapeRule::PATH_SEPARATORS)) ||
           (code_point > ' ' && code_point != '/' && code_point != '\\' &&
            (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
  }

  // Compare the code point against a list of characters that can be used
  // to spoof other URLs.
  //
  // Can't use icu to make this cleaner, because Cronet cannot depend on
  // icu, and currently uses this file.
  // TODO(https://crbug.com/829873): Try to make this use icu, both to
  // protect against regressions as the Unicode standard is updated and to
  // reduce the number of long lists of characters.
  //
  // The expression inside !(...) is true for a banned code point, so the
  // function returns false for every entry in the list.
  return !(
      // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
      // control characters are not allowed to appear unescaped in URLs.
      code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
      code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
      code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
      code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
      code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
      code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
      code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)

      // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
      // since added some new BiDi control characters that are not safe to
      // unescape. http://www.unicode.org/reports/tr9
      code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
      code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
      code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
      code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
      code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)

      // The following spoofable characters are also banned in unescaped URLs,
      // because they could be used to imitate parts of a web browser's UI.
      code_point == 0x1F50F ||  // LOCK WITH INK PEN    (%F0%9F%94%8F)
      code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
      code_point == 0x1F512 ||  // LOCK                 (%F0%9F%94%92)
      code_point == 0x1F513 ||  // OPEN LOCK            (%F0%9F%94%93)

      // Spaces are also banned, as they can be used to scroll text out of view.
      code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
      code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
      code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
      code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
      code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
      code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
      code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
      code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
      code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
      code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
      code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
      code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
      code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
      code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
      code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
      code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
      code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
      code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
      code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
      // U+2800 is rendered as a space, but is not considered whitespace (see
      // crbug.com/1068531).
      code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)

      // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
      // characters ([:Cf:]) are also banned (see crbug.com/824715).
      code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
      code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
      // Arabic number formatting
      (code_point >= 0x0600 && code_point <= 0x0605) ||
      // U+061C is already banned as a BiDi control character.
      code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
      code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
      code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
      code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
      code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
      code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
      code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
      code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
                               // (%E1%A0%8B)
      code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
                               // (%E1%A0%8C)
      code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
                               // (%E1%A0%8D)
      code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
      code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
      code_point == 0x200C ||  // ZERO WIDTH NON-JOINER       (%E2%80%8C)
      code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
      // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
      // BiDi control characters.
      code_point == 0x2060 ||  // WORD JOINER          (%E2%81%A0)
      code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
      code_point == 0x2062 ||  // INVISIBLE TIMES      (%E2%81%A2)
      code_point == 0x2063 ||  // INVISIBLE SEPARATOR  (%E2%81%A3)
      code_point == 0x2064 ||  // INVISIBLE PLUS       (%E2%81%A4)
      code_point == 0x2065 ||  // null (%E2%81%A5)
      // 0x2066--0x2069 are already banned as a BiDi control characters.
      // General Punctuation - Deprecated (U+206A--206F)
      (code_point >= 0x206A && code_point <= 0x206F) ||
      code_point == 0x3164 ||  // HANGUL FILLER (%E3%85%A4)
      (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
      // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
      (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
      code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
      code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
      code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
      code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
      code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
      code_point == 0x110BD ||  // KAITHI NUMBER SIGN       (%F0%91%82%BD)
      code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
      // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
      (code_point >= 0x13430 && code_point <= 0x13438) ||
      // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
      (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
      // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
      (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
      // Tags, Variation Selectors, nulls
      (code_point >= 0xE0000 && code_point <= 0xE0FFF));
}
385 
386 // Unescapes |escaped_text| according to |rules|, returning the resulting
387 // string.  Fills in an |adjustments| parameter, if non-nullptr, so it reflects
388 // the alterations done to the string that are not one-character-to-one-
389 // character.  The resulting |adjustments| will always be sorted by increasing
390 // offset.
UnescapeURLWithAdjustmentsImpl(StringPiece escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)391 std::string UnescapeURLWithAdjustmentsImpl(
392     StringPiece escaped_text,
393     UnescapeRule::Type rules,
394     OffsetAdjuster::Adjustments* adjustments) {
395   if (adjustments)
396     adjustments->clear();
397   // Do not unescape anything, return the |escaped_text| text.
398   if (rules == UnescapeRule::NONE)
399     return std::string(escaped_text);
400 
401   // The output of the unescaping is always smaller than the input, so we can
402   // reserve the input size to make sure we have enough buffer and don't have
403   // to allocate in the loop below.
404   std::string result;
405   result.reserve(escaped_text.length());
406 
407   // Locations of adjusted text.
408   for (size_t i = 0, max = escaped_text.size(); i < max;) {
409     // Try to unescape the character.
410     base_icu::UChar32 code_point;
411     std::string unescaped;
412     if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
413                                       &unescaped)) {
414       // Check if the next character can be unescaped, but not as a valid UTF-8
415       // character. In that case, just unescaped and write the non-sense
416       // character.
417       //
418       // TODO(https://crbug.com/829868): Do not unescape illegal UTF-8
419       // sequences.
420       unsigned char non_utf8_byte;
421       if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
422         result.push_back(static_cast<char>(non_utf8_byte));
423         if (adjustments)
424           adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
425         i += 3;
426         continue;
427       }
428 
429       // Character is not escaped, so append as is, unless it's a '+' and
430       // REPLACE_PLUS_WITH_SPACE is being applied.
431       if (escaped_text[i] == '+' &&
432           (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
433         result.push_back(' ');
434       } else {
435         result.push_back(escaped_text[i]);
436       }
437       ++i;
438       continue;
439     }
440 
441     DCHECK(!unescaped.empty());
442 
443     if (!ShouldUnescapeCodePoint(rules, code_point)) {
444       // If it's a valid UTF-8 character, but not safe to unescape, copy all
445       // bytes directly.
446       result.append(escaped_text.begin() + i,
447                     escaped_text.begin() + i + 3 * unescaped.length());
448       i += unescaped.length() * 3;
449       continue;
450     }
451 
452     // If the code point is allowed, and append the entire unescaped character.
453     result.append(unescaped);
454     if (adjustments) {
455       for (size_t j = 0; j < unescaped.length(); ++j) {
456         adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
457       }
458     }
459     i += 3 * unescaped.length();
460   }
461 
462   return result;
463 }
464 
465 }  // namespace
466 
EscapeAllExceptUnreserved(StringPiece text)467 std::string EscapeAllExceptUnreserved(StringPiece text) {
468   return Escape(text, kUnreservedCharmap, false);
469 }
470 
EscapeQueryParamValue(StringPiece text,bool use_plus)471 std::string EscapeQueryParamValue(StringPiece text, bool use_plus) {
472   return Escape(text, kQueryCharmap, use_plus);
473 }
474 
EscapePath(StringPiece path)475 std::string EscapePath(StringPiece path) {
476   return Escape(path, kPathCharmap, false);
477 }
478 
479 #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(StringPiece precursor)480 std::string EscapeNSURLPrecursor(StringPiece precursor) {
481   return Escape(precursor, kNSURLCharmap, false, true);
482 }
483 #endif  // BUILDFLAG(IS_APPLE)
484 
EscapeUrlEncodedData(StringPiece path,bool use_plus)485 std::string EscapeUrlEncodedData(StringPiece path, bool use_plus) {
486   return Escape(path, kUrlEscape, use_plus);
487 }
488 
EscapeNonASCIIAndPercent(StringPiece input)489 std::string EscapeNonASCIIAndPercent(StringPiece input) {
490   return Escape(input, kNonASCIICharmapAndPercent, false);
491 }
492 
EscapeNonASCII(StringPiece input)493 std::string EscapeNonASCII(StringPiece input) {
494   return Escape(input, kNonASCIICharmap, false);
495 }
496 
EscapeExternalHandlerValue(StringPiece text)497 std::string EscapeExternalHandlerValue(StringPiece text) {
498   return Escape(text, kExternalHandlerCharmap, false, true);
499 }
500 
AppendEscapedCharForHTML(char c,std::string * output)501 void AppendEscapedCharForHTML(char c, std::string* output) {
502   AppendEscapedCharForHTMLImpl(c, output);
503 }
504 
EscapeForHTML(StringPiece input)505 std::string EscapeForHTML(StringPiece input) {
506   return EscapeForHTMLImpl(input);
507 }
508 
EscapeForHTML(StringPiece16 input)509 std::u16string EscapeForHTML(StringPiece16 input) {
510   return EscapeForHTMLImpl(input);
511 }
512 
UnescapeURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)513 std::string UnescapeURLComponent(StringPiece escaped_text,
514                                  UnescapeRule::Type rules) {
515   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
516 }
517 
UnescapeAndDecodeUTF8URLComponentWithAdjustments(StringPiece text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)518 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
519     StringPiece text,
520     UnescapeRule::Type rules,
521     OffsetAdjuster::Adjustments* adjustments) {
522   std::u16string result;
523   OffsetAdjuster::Adjustments unescape_adjustments;
524   std::string unescaped_url(
525       UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
526   if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
527                                  &result, adjustments)) {
528     // Character set looks like it's valid.
529     if (adjustments) {
530       OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
531                                                  adjustments);
532     }
533     return result;
534   }
535   // Character set is not valid.  Return the escaped version.
536   return UTF8ToUTF16WithAdjustments(text, adjustments);
537 }
538 
UnescapeBinaryURLComponent(StringPiece escaped_text,UnescapeRule::Type rules)539 std::string UnescapeBinaryURLComponent(StringPiece escaped_text,
540                                        UnescapeRule::Type rules) {
541   // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
542   DCHECK(rules != UnescapeRule::NONE);
543   DCHECK(!(rules &
544            ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
545 
546   // It is not possible to read the feature state when this function is invoked
547   // before FeatureList initialization. In that case, fallback to the feature's
548   // default state.
549   //
550   // TODO(crbug.com/1321924): Cleanup this feature.
551   const bool optimize_data_urls_feature_is_enabled =
552       base::FeatureList::GetInstance()
553           ? base::FeatureList::IsEnabled(features::kOptimizeDataUrls)
554           : features::kOptimizeDataUrls.default_state ==
555                 base::FEATURE_ENABLED_BY_DEFAULT;
556 
557   // If there are no '%' characters in the string, there will be nothing to
558   // unescape, so we can take the fast path.
559   if (optimize_data_urls_feature_is_enabled &&
560       escaped_text.find('%') == StringPiece::npos) {
561     std::string unescaped_text(escaped_text);
562     if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
563       std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
564     return unescaped_text;
565   }
566 
567   std::string unescaped_text;
568 
569   // The output of the unescaping is always smaller than the input, so we can
570   // reserve the input size to make sure we have enough buffer and don't have
571   // to allocate in the loop below.
572   // Increase capacity before size, as just resizing can grow capacity
573   // needlessly beyond our requested size.
574   unescaped_text.reserve(escaped_text.size());
575   unescaped_text.resize(escaped_text.size());
576 
577   size_t output_index = 0;
578 
579   for (size_t i = 0, max = escaped_text.size(); i < max;) {
580     unsigned char byte;
581     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
582     // to call.
583     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
584       unescaped_text[output_index++] = static_cast<char>(byte);
585       i += 3;
586       continue;
587     }
588 
589     if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
590         escaped_text[i] == '+') {
591       unescaped_text[output_index++] = ' ';
592       ++i;
593       continue;
594     }
595 
596     unescaped_text[output_index++] = escaped_text[i++];
597   }
598 
599   DCHECK_LE(output_index, unescaped_text.size());
600   unescaped_text.resize(output_index);
601   return unescaped_text;
602 }
603 
UnescapeBinaryURLComponentSafe(StringPiece escaped_text,bool fail_on_path_separators,std::string * unescaped_text)604 bool UnescapeBinaryURLComponentSafe(StringPiece escaped_text,
605                                     bool fail_on_path_separators,
606                                     std::string* unescaped_text) {
607   unescaped_text->clear();
608 
609   std::set<unsigned char> illegal_encoded_bytes;
610   for (unsigned char c = '\x00'; c < '\x20'; ++c) {
611     illegal_encoded_bytes.insert(c);
612   }
613   if (fail_on_path_separators) {
614     illegal_encoded_bytes.insert('/');
615     illegal_encoded_bytes.insert('\\');
616   }
617   if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
618     return false;
619 
620   *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
621   return true;
622 }
623 
ContainsEncodedBytes(StringPiece escaped_text,const std::set<unsigned char> & bytes)624 bool ContainsEncodedBytes(StringPiece escaped_text,
625                           const std::set<unsigned char>& bytes) {
626   for (size_t i = 0, max = escaped_text.size(); i < max;) {
627     unsigned char byte;
628     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
629     // to call.
630     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
631       if (bytes.find(byte) != bytes.end())
632         return true;
633 
634       i += 3;
635       continue;
636     }
637 
638     ++i;
639   }
640 
641   return false;
642 }
643 
UnescapeForHTML(StringPiece16 input)644 std::u16string UnescapeForHTML(StringPiece16 input) {
645   static const struct {
646     const char* ampersand_code;
647     const char16_t replacement;
648   } kEscapeToChars[] = {
649       {"<", '<'},   {">", '>'},   {"&", '&'},
650       {""", '"'}, {"'", '\''},
651   };
652   constexpr size_t kEscapeToCharsCount = std::size(kEscapeToChars);
653 
654   if (input.find(u"&") == std::string::npos)
655     return std::u16string(input);
656 
657   std::u16string ampersand_chars[kEscapeToCharsCount];
658   std::u16string text(input);
659   for (std::u16string::iterator iter = text.begin(); iter != text.end();
660        ++iter) {
661     if (*iter == '&') {
662       // Potential ampersand encode char.
663       size_t index = static_cast<size_t>(iter - text.begin());
664       for (size_t i = 0; i < std::size(kEscapeToChars); i++) {
665         if (ampersand_chars[i].empty()) {
666           ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
667         }
668         if (text.find(ampersand_chars[i], index) == index) {
669           text.replace(
670               iter, iter + static_cast<ptrdiff_t>(ampersand_chars[i].length()),
671               1, kEscapeToChars[i].replacement);
672           break;
673         }
674       }
675     }
676   }
677   return text;
678 }
679 
680 }  // namespace base
681