• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "base/strings/escape.h"
11 
12 #include <ostream>
13 #include <string_view>
14 
15 #include "base/check_op.h"
16 #include "base/strings/string_number_conversions.h"
17 #include "base/strings/string_util.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 #include "base/strings/utf_string_conversions.h"
20 #include "base/third_party/icu/icu_utf.h"
21 
22 namespace base {
23 
24 namespace {
25 
// A fast bit-vector map for ASCII characters.
//
// Stores 256 membership bits in an array of eight 32-bit words: the bit for
// byte |c| lives in word |c >> 5| at bit position |c & 31|.
struct Charmap {
  // Returns true if the bit for byte |c| is set in the map.
  bool Contains(unsigned char c) const {
    // Shift an unsigned 32-bit one so that selecting bit 31 never shifts a
    // signed int into its sign bit.
    return (map[c >> 5] & (uint32_t{1} << (c & 31))) != 0;
  }

  uint32_t map[8];
};
37 
38 // Given text to escape and a Charmap defining which values to escape,
39 // return an escaped string.  If use_plus is true, spaces are converted
40 // to +, otherwise, if spaces are in the charmap, they are converted to
41 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
42 // '%' is in the charmap, it is converted to %25.
Escape(std::string_view text,const Charmap & charmap,bool use_plus,bool keep_escaped=false)43 std::string Escape(std::string_view text,
44                    const Charmap& charmap,
45                    bool use_plus,
46                    bool keep_escaped = false) {
47   std::string escaped;
48   escaped.reserve(text.length() * 3);
49   for (size_t i = 0; i < text.length(); ++i) {
50     unsigned char c = static_cast<unsigned char>(text[i]);
51     if (use_plus && ' ' == c) {
52       escaped.push_back('+');
53     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
54                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
55       escaped.push_back('%');
56     } else if (charmap.Contains(c)) {
57       escaped.push_back('%');
58       AppendHexEncodedByte(c, escaped);
59     } else {
60       escaped.push_back(static_cast<char>(c));
61     }
62   }
63   return escaped;
64 }
65 
// Appends |c| to |output|, replacing it with the matching HTML entity when it
// is one of < > & " ' so that it cannot be mistaken for markup. Any other
// character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  std::string_view entity;
  switch (c) {
    case '<':
      entity = "&lt;";
      break;
    case '>':
      entity = "&gt;";
      break;
    case '&':
      entity = "&amp;";
      break;
    case '"':
      entity = "&quot;";
      break;
    case '\'':
      entity = "&#39;";
      break;
    default:
      output->push_back(c);
      return;
  }
  // Iterator-range append so the narrow entity text is converted
  // element-by-element to the output string's character type.
  output->append(entity.begin(), entity.end());
}
85 
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(), so the result will not be interpreted as
// HTML.
template <typename T, typename CharT = typename T::value_type>
std::basic_string<CharT> EscapeForHTMLImpl(T input) {
  std::basic_string<CharT> escaped;
  // Size for the common case in which nothing needs escaping.
  escaped.reserve(input.size());

  for (auto it = input.begin(); it != input.end(); ++it) {
    AppendEscapedCharForHTMLImpl(*it, &escaped);
  }
  return escaped;
}
98 
99 // Everything except alphanumerics and -._~
100 // See RFC 3986 for the list of unreserved characters.
101 static const Charmap kUnreservedCharmap = {
102     {0xffffffffL, 0xfc009fffL, 0x78000001L, 0xb8000001L, 0xffffffffL,
103      0xffffffffL, 0xffffffffL, 0xffffffffL}};
104 
105 // Everything except alphanumerics and !'()*-._~
106 // See RFC 2396 for the list of reserved characters.
107 static const Charmap kQueryCharmap = {{0xffffffffL, 0xfc00987dL, 0x78000001L,
108                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
109                                        0xffffffffL, 0xffffffffL}};
110 
111 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
112 static const Charmap kPathCharmap = {{0xffffffffL, 0xd400002dL, 0x78000000L,
113                                       0xb8000001L, 0xffffffffL, 0xffffffffL,
114                                       0xffffffffL, 0xffffffffL}};
115 
116 #if BUILDFLAG(IS_APPLE)
117 // non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
118 static const Charmap kNSURLCharmap = {{0xffffffffL, 0x5000002dL, 0x78000000L,
119                                        0xb8000001L, 0xffffffffL, 0xffffffffL,
120                                        0xffffffffL, 0xffffffffL}};
121 #endif  // BUILDFLAG(IS_APPLE)
122 
// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"@[\]^`{|}
// Note: '!' is not set in this map, so it is left unescaped.
124 static const Charmap kUrlEscape = {{0xffffffffL, 0xf80008fdL, 0x78000001L,
125                                     0xb8000001L, 0xffffffffL, 0xffffffffL,
126                                     0xffffffffL, 0xffffffffL}};
127 
128 // non-7bit, as well as %.
129 static const Charmap kNonASCIICharmapAndPercent = {
130     {0x00000000L, 0x00000020L, 0x00000000L, 0x00000000L, 0xffffffffL,
131      0xffffffffL, 0xffffffffL, 0xffffffffL}};
132 
133 // non-7bit
134 static const Charmap kNonASCIICharmap = {{0x00000000L, 0x00000000L, 0x00000000L,
135                                           0x00000000L, 0xffffffffL, 0xffffffffL,
136                                           0xffffffffL, 0xffffffffL}};
137 
138 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
139 // !'()*-._~#[]
140 static const Charmap kExternalHandlerCharmap = {
141     {0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L, 0xffffffffL,
142      0xffffffffL, 0xffffffffL, 0xffffffffL}};
143 
// Contains nonzero when the corresponding character is unescapable for normal
// URLs. These characters are the ones that may change the parsing of a URL, so
// we don't want to unescape them sometimes. In many cases we won't want to
// unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
153 //
154 // Lastly, we can't unescape anything that doesn't have a canonical
155 // representation in a URL. This means that unescaping will change the URL, and
156 // you could get different behavior if you copy and paste the URL, or press
157 // enter in the URL bar. The list of characters that fall into this category
158 // are the ones labeled PASS (allow either escaped or unescaped) in the big
159 // lookup table at the top of url/url_canon_path.cc.  Also, characters
160 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not
161 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
162 // not unescaped, to avoid turning a valid url according to spec into an
163 // invalid one.
164 // clang-format off
165 const char kUrlUnescape[128] = {
166 //   Null, control chars...
167      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 //  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
170      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
171 //   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
172      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
173 //   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
174      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 //   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
176      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
177 //   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
178      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
180      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
181 };
182 // clang-format on
183 
184 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
185 // successful, sets |value| to the unescaped value.  Returns whether
186 // unescaping succeeded.
UnescapeUnsignedByteAtIndex(std::string_view escaped_text,size_t index,unsigned char * value)187 bool UnescapeUnsignedByteAtIndex(std::string_view escaped_text,
188                                  size_t index,
189                                  unsigned char* value) {
190   if ((index + 2) >= escaped_text.size())
191     return false;
192   if (escaped_text[index] != '%')
193     return false;
194   char most_sig_digit(escaped_text[index + 1]);
195   char least_sig_digit(escaped_text[index + 2]);
196   if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
197     *value = static_cast<unsigned char>(HexDigitToInt(most_sig_digit) * 16 +
198                                         HexDigitToInt(least_sig_digit));
199     return true;
200   }
201   return false;
202 }
203 
204 // Attempts to unescape and decode a UTF-8-encoded percent-escaped character at
205 // the specified index. On success, returns true, sets |code_point_out| to be
206 // the character's code point and |unescaped_out| to be the unescaped UTF-8
207 // string. |unescaped_out| will always be 1/3rd the length of the substring of
208 // |escaped_text| that corresponds to the unescaped character.
UnescapeUTF8CharacterAtIndex(std::string_view escaped_text,size_t index,base_icu::UChar32 * code_point_out,std::string * unescaped_out)209 bool UnescapeUTF8CharacterAtIndex(std::string_view escaped_text,
210                                   size_t index,
211                                   base_icu::UChar32* code_point_out,
212                                   std::string* unescaped_out) {
213   DCHECK(unescaped_out->empty());
214 
215   unsigned char bytes[CBU8_MAX_LENGTH];
216   if (!UnescapeUnsignedByteAtIndex(escaped_text, index, &bytes[0]))
217     return false;
218 
219   size_t num_bytes = 1;
220 
221   // If this is a lead byte, need to collect trail bytes as well.
222   if (CBU8_IS_LEAD(bytes[0])) {
223     // Look for the last trail byte of the UTF-8 character.  Give up once
224     // reach max character length number of bytes, or hit an unescaped
225     // character. No need to check length of escaped_text, as
226     // UnescapeUnsignedByteAtIndex checks lengths.
227     while (num_bytes < std::size(bytes) &&
228            UnescapeUnsignedByteAtIndex(escaped_text, index + num_bytes * 3,
229                                        &bytes[num_bytes]) &&
230            CBU8_IS_TRAIL(bytes[num_bytes])) {
231       ++num_bytes;
232     }
233   }
234 
235   size_t char_index = 0;
236   // Check if the unicode "character" that was just unescaped is valid.
237   if (!ReadUnicodeCharacter(reinterpret_cast<char*>(bytes), num_bytes,
238                             &char_index, code_point_out)) {
239     return false;
240   }
241 
242   // It's possible that a prefix of |bytes| forms a valid UTF-8 character,
243   // and the rest are not valid UTF-8, so need to update |num_bytes| based
244   // on the result of ReadUnicodeCharacter().
245   num_bytes = char_index + 1;
246   *unescaped_out = std::string(reinterpret_cast<char*>(bytes), num_bytes);
247   return true;
248 }
249 
250 // This method takes a Unicode code point and returns true if it should be
251 // unescaped, based on |rules|.
ShouldUnescapeCodePoint(UnescapeRule::Type rules,base_icu::UChar32 code_point)252 bool ShouldUnescapeCodePoint(UnescapeRule::Type rules,
253                              base_icu::UChar32 code_point) {
254   // If this is an ASCII character, use the lookup table.
255   if (code_point >= 0 && code_point < 0x80) {
256     return kUrlUnescape[static_cast<size_t>(code_point)] ||
257            // Allow some additional unescaping when flags are set.
258            (code_point == ' ' && (rules & UnescapeRule::SPACES)) ||
259            // Allow any of the prohibited but non-control characters when doing
260            // "special" chars.
261            ((code_point == '/' || code_point == '\\') &&
262             (rules & UnescapeRule::PATH_SEPARATORS)) ||
263            (code_point > ' ' && code_point != '/' && code_point != '\\' &&
264             (rules & UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS));
265   }
266 
267   // Compare the code point against a list of characters that can be used
268   // to spoof other URLs.
269   //
270   // Can't use icu to make this cleaner, because Cronet cannot depend on
271   // icu, and currently uses this file.
272   // TODO(crbug.com/41381359): Try to make this use icu, both to
273   // protect against regressions as the Unicode standard is updated and to
274   // reduce the number of long lists of characters.
275   return !(
276       // Per http://tools.ietf.org/html/rfc3987#section-4.1, certain BiDi
277       // control characters are not allowed to appear unescaped in URLs.
278       code_point == 0x200E ||  // LEFT-TO-RIGHT MARK         (%E2%80%8E)
279       code_point == 0x200F ||  // RIGHT-TO-LEFT MARK         (%E2%80%8F)
280       code_point == 0x202A ||  // LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
281       code_point == 0x202B ||  // RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
282       code_point == 0x202C ||  // POP DIRECTIONAL FORMATTING (%E2%80%AC)
283       code_point == 0x202D ||  // LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
284       code_point == 0x202E ||  // RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
285 
286       // The Unicode Technical Report (TR9) as referenced by RFC 3987 above has
287       // since added some new BiDi control characters that are not safe to
288       // unescape. http://www.unicode.org/reports/tr9
289       code_point == 0x061C ||  // ARABIC LETTER MARK         (%D8%9C)
290       code_point == 0x2066 ||  // LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
291       code_point == 0x2067 ||  // RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
292       code_point == 0x2068 ||  // FIRST STRONG ISOLATE       (%E2%81%A8)
293       code_point == 0x2069 ||  // POP DIRECTIONAL ISOLATE    (%E2%81%A9)
294 
295       // The following spoofable characters are also banned in unescaped URLs,
296       // because they could be used to imitate parts of a web browser's UI.
297       code_point == 0x1F50F ||  // LOCK WITH INK PEN    (%F0%9F%94%8F)
298       code_point == 0x1F510 ||  // CLOSED LOCK WITH KEY (%F0%9F%94%90)
299       code_point == 0x1F512 ||  // LOCK                 (%F0%9F%94%92)
300       code_point == 0x1F513 ||  // OPEN LOCK            (%F0%9F%94%93)
301 
302       // Spaces are also banned, as they can be used to scroll text out of view.
303       code_point == 0x0085 ||  // NEXT LINE                  (%C2%85)
304       code_point == 0x00A0 ||  // NO-BREAK SPACE             (%C2%A0)
305       code_point == 0x1680 ||  // OGHAM SPACE MARK           (%E1%9A%80)
306       code_point == 0x2000 ||  // EN QUAD                    (%E2%80%80)
307       code_point == 0x2001 ||  // EM QUAD                    (%E2%80%81)
308       code_point == 0x2002 ||  // EN SPACE                   (%E2%80%82)
309       code_point == 0x2003 ||  // EM SPACE                   (%E2%80%83)
310       code_point == 0x2004 ||  // THREE-PER-EM SPACE         (%E2%80%84)
311       code_point == 0x2005 ||  // FOUR-PER-EM SPACE          (%E2%80%85)
312       code_point == 0x2006 ||  // SIX-PER-EM SPACE           (%E2%80%86)
313       code_point == 0x2007 ||  // FIGURE SPACE               (%E2%80%87)
314       code_point == 0x2008 ||  // PUNCTUATION SPACE          (%E2%80%88)
315       code_point == 0x2009 ||  // THIN SPACE                 (%E2%80%89)
316       code_point == 0x200A ||  // HAIR SPACE                 (%E2%80%8A)
317       code_point == 0x2028 ||  // LINE SEPARATOR             (%E2%80%A8)
318       code_point == 0x2029 ||  // PARAGRAPH SEPARATOR        (%E2%80%A9)
319       code_point == 0x202F ||  // NARROW NO-BREAK SPACE      (%E2%80%AF)
320       code_point == 0x205F ||  // MEDIUM MATHEMATICAL SPACE  (%E2%81%9F)
321       code_point == 0x3000 ||  // IDEOGRAPHIC SPACE          (%E3%80%80)
322       // U+2800 is rendered as a space, but is not considered whitespace (see
323       // crbug.com/1068531).
324       code_point == 0x2800 ||  // BRAILLE PATTERN BLANK      (%E2%A0%80)
325 
326       // Default Ignorable ([:Default_Ignorable_Code_Point=Yes:]) and Format
327       // characters ([:Cf:]) are also banned (see crbug.com/824715).
328       code_point == 0x00AD ||  // SOFT HYPHEN               (%C2%AD)
329       code_point == 0x034F ||  // COMBINING GRAPHEME JOINER (%CD%8F)
330       // Arabic number formatting
331       (code_point >= 0x0600 && code_point <= 0x0605) ||
332       // U+061C is already banned as a BiDi control character.
333       code_point == 0x06DD ||  // ARABIC END OF AYAH          (%DB%9D)
334       code_point == 0x070F ||  // SYRIAC ABBREVIATION MARK    (%DC%8F)
335       code_point == 0x08E2 ||  // ARABIC DISPUTED END OF AYAH (%E0%A3%A2)
336       code_point == 0x115F ||  // HANGUL CHOSEONG FILLER      (%E1%85%9F)
337       code_point == 0x1160 ||  // HANGUL JUNGSEONG FILLER     (%E1%85%A0)
338       code_point == 0x17B4 ||  // KHMER VOWEL INHERENT AQ     (%E1%9E%B4)
339       code_point == 0x17B5 ||  // KHMER VOWEL INHERENT AA     (%E1%9E%B5)
340       code_point == 0x180B ||  // MONGOLIAN FREE VARIATION SELECTOR ONE
341                                // (%E1%A0%8B)
342       code_point == 0x180C ||  // MONGOLIAN FREE VARIATION SELECTOR TWO
343                                // (%E1%A0%8C)
344       code_point == 0x180D ||  // MONGOLIAN FREE VARIATION SELECTOR THREE
345                                // (%E1%A0%8D)
346       code_point == 0x180E ||  // MONGOLIAN VOWEL SEPARATOR   (%E1%A0%8E)
347       code_point == 0x200B ||  // ZERO WIDTH SPACE            (%E2%80%8B)
348       code_point == 0x200C ||  // ZERO WIDTH SPACE NON-JOINER (%E2%80%8C)
349       code_point == 0x200D ||  // ZERO WIDTH JOINER           (%E2%80%8D)
350       // U+200E, U+200F, U+202A--202E, and U+2066--2069 are already banned as
351       // BiDi control characters.
352       code_point == 0x2060 ||  // WORD JOINER          (%E2%81%A0)
353       code_point == 0x2061 ||  // FUNCTION APPLICATION (%E2%81%A1)
354       code_point == 0x2062 ||  // INVISIBLE TIMES      (%E2%81%A2)
355       code_point == 0x2063 ||  // INVISIBLE SEPARATOR  (%E2%81%A3)
356       code_point == 0x2064 ||  // INVISIBLE PLUS       (%E2%81%A4)
357       code_point == 0x2065 ||  // null (%E2%81%A5)
358       // 0x2066--0x2069 are already banned as a BiDi control characters.
359       // General Punctuation - Deprecated (U+206A--206F)
360       (code_point >= 0x206A && code_point <= 0x206F) ||
361       code_point == 0x3164 ||  // HANGUL FILLER (%E3%85%A4)
362       (code_point >= 0xFFF0 && code_point <= 0xFFF8) ||  // null
363       // Variation selectors (%EF%B8%80 -- %EF%B8%8F)
364       (code_point >= 0xFE00 && code_point <= 0xFE0F) ||
365       code_point == 0xFEFF ||   // ZERO WIDTH NO-BREAK SPACE (%EF%BB%BF)
366       code_point == 0xFFA0 ||   // HALFWIDTH HANGUL FILLER (%EF%BE%A0)
367       code_point == 0xFFF9 ||   // INTERLINEAR ANNOTATION ANCHOR     (%EF%BF%B9)
368       code_point == 0xFFFA ||   // INTERLINEAR ANNOTATION SEPARATOR  (%EF%BF%BA)
369       code_point == 0xFFFB ||   // INTERLINEAR ANNOTATION TERMINATOR (%EF%BF%BB)
370       code_point == 0x110BD ||  // KAITHI NUMBER SIGN       (%F0%91%82%BD)
371       code_point == 0x110CD ||  // KAITHI NUMBER SIGN ABOVE (%F0%91%83%8D)
372       // Egyptian hieroglyph formatting (%F0%93%90%B0 -- %F0%93%90%B8)
373       (code_point >= 0x13430 && code_point <= 0x13438) ||
374       // Shorthand format controls (%F0%9B%B2%A0 -- %F0%9B%B2%A3)
375       (code_point >= 0x1BCA0 && code_point <= 0x1BCA3) ||
376       // Beams and slurs (%F0%9D%85%B3 -- %F0%9D%85%BA)
377       (code_point >= 0x1D173 && code_point <= 0x1D17A) ||
378       // Tags, Variation Selectors, nulls
379       (code_point >= 0xE0000 && code_point <= 0xE0FFF));
380 }
381 
382 // Unescapes |escaped_text| according to |rules|, returning the resulting
383 // string.  Fills in an |adjustments| parameter, if non-nullptr, so it reflects
384 // the alterations done to the string that are not one-character-to-one-
385 // character.  The resulting |adjustments| will always be sorted by increasing
386 // offset.
UnescapeURLWithAdjustmentsImpl(std::string_view escaped_text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)387 std::string UnescapeURLWithAdjustmentsImpl(
388     std::string_view escaped_text,
389     UnescapeRule::Type rules,
390     OffsetAdjuster::Adjustments* adjustments) {
391   if (adjustments)
392     adjustments->clear();
393   // Do not unescape anything, return the |escaped_text| text.
394   if (rules == UnescapeRule::NONE)
395     return std::string(escaped_text);
396 
397   // The output of the unescaping is always smaller than the input, so we can
398   // reserve the input size to make sure we have enough buffer and don't have
399   // to allocate in the loop below.
400   std::string result;
401   result.reserve(escaped_text.length());
402 
403   // Locations of adjusted text.
404   for (size_t i = 0, max = escaped_text.size(); i < max;) {
405     // Try to unescape the character.
406     base_icu::UChar32 code_point;
407     std::string unescaped;
408     if (!UnescapeUTF8CharacterAtIndex(escaped_text, i, &code_point,
409                                       &unescaped)) {
410       // Check if the next character can be unescaped, but not as a valid UTF-8
411       // character. In that case, just unescaped and write the non-sense
412       // character.
413       //
414       // TODO(crbug.com/40570496): Do not unescape illegal UTF-8
415       // sequences.
416       unsigned char non_utf8_byte;
417       if (UnescapeUnsignedByteAtIndex(escaped_text, i, &non_utf8_byte)) {
418         result.push_back(static_cast<char>(non_utf8_byte));
419         if (adjustments)
420           adjustments->push_back(OffsetAdjuster::Adjustment(i, 3, 1));
421         i += 3;
422         continue;
423       }
424 
425       // Character is not escaped, so append as is, unless it's a '+' and
426       // REPLACE_PLUS_WITH_SPACE is being applied.
427       if (escaped_text[i] == '+' &&
428           (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)) {
429         result.push_back(' ');
430       } else {
431         result.push_back(escaped_text[i]);
432       }
433       ++i;
434       continue;
435     }
436 
437     DCHECK(!unescaped.empty());
438 
439     if (!ShouldUnescapeCodePoint(rules, code_point)) {
440       // If it's a valid UTF-8 character, but not safe to unescape, copy all
441       // bytes directly.
442       result.append(escaped_text.substr(i, 3 * unescaped.length()));
443       i += unescaped.length() * 3;
444       continue;
445     }
446 
447     // If the code point is allowed, and append the entire unescaped character.
448     result.append(unescaped);
449     if (adjustments) {
450       for (size_t j = 0; j < unescaped.length(); ++j) {
451         adjustments->push_back(OffsetAdjuster::Adjustment(i + j * 3, 3, 1));
452       }
453     }
454     i += 3 * unescaped.length();
455   }
456 
457   return result;
458 }
459 
460 }  // namespace
461 
EscapeAllExceptUnreserved(std::string_view text)462 std::string EscapeAllExceptUnreserved(std::string_view text) {
463   return Escape(text, kUnreservedCharmap, false);
464 }
465 
EscapeQueryParamValue(std::string_view text,bool use_plus)466 std::string EscapeQueryParamValue(std::string_view text, bool use_plus) {
467   return Escape(text, kQueryCharmap, use_plus);
468 }
469 
EscapePath(std::string_view path)470 std::string EscapePath(std::string_view path) {
471   return Escape(path, kPathCharmap, false);
472 }
473 
474 #if BUILDFLAG(IS_APPLE)
EscapeNSURLPrecursor(std::string_view precursor)475 std::string EscapeNSURLPrecursor(std::string_view precursor) {
476   return Escape(precursor, kNSURLCharmap, false, true);
477 }
478 #endif  // BUILDFLAG(IS_APPLE)
479 
EscapeUrlEncodedData(std::string_view path,bool use_plus)480 std::string EscapeUrlEncodedData(std::string_view path, bool use_plus) {
481   return Escape(path, kUrlEscape, use_plus);
482 }
483 
EscapeNonASCIIAndPercent(std::string_view input)484 std::string EscapeNonASCIIAndPercent(std::string_view input) {
485   return Escape(input, kNonASCIICharmapAndPercent, false);
486 }
487 
EscapeNonASCII(std::string_view input)488 std::string EscapeNonASCII(std::string_view input) {
489   return Escape(input, kNonASCIICharmap, false);
490 }
491 
EscapeExternalHandlerValue(std::string_view text)492 std::string EscapeExternalHandlerValue(std::string_view text) {
493   return Escape(text, kExternalHandlerCharmap, false, true);
494 }
495 
AppendEscapedCharForHTML(char c,std::string * output)496 void AppendEscapedCharForHTML(char c, std::string* output) {
497   AppendEscapedCharForHTMLImpl(c, output);
498 }
499 
EscapeForHTML(std::string_view input)500 std::string EscapeForHTML(std::string_view input) {
501   return EscapeForHTMLImpl(input);
502 }
503 
EscapeForHTML(std::u16string_view input)504 std::u16string EscapeForHTML(std::u16string_view input) {
505   return EscapeForHTMLImpl(input);
506 }
507 
UnescapeURLComponent(std::string_view escaped_text,UnescapeRule::Type rules)508 std::string UnescapeURLComponent(std::string_view escaped_text,
509                                  UnescapeRule::Type rules) {
510   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, nullptr);
511 }
512 
UnescapeAndDecodeUTF8URLComponentWithAdjustments(std::string_view text,UnescapeRule::Type rules,OffsetAdjuster::Adjustments * adjustments)513 std::u16string UnescapeAndDecodeUTF8URLComponentWithAdjustments(
514     std::string_view text,
515     UnescapeRule::Type rules,
516     OffsetAdjuster::Adjustments* adjustments) {
517   std::u16string result;
518   OffsetAdjuster::Adjustments unescape_adjustments;
519   std::string unescaped_url(
520       UnescapeURLWithAdjustmentsImpl(text, rules, &unescape_adjustments));
521   if (UTF8ToUTF16WithAdjustments(unescaped_url.data(), unescaped_url.length(),
522                                  &result, adjustments)) {
523     // Character set looks like it's valid.
524     if (adjustments) {
525       OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
526                                                  adjustments);
527     }
528     return result;
529   }
530   // Character set is not valid.  Return the escaped version.
531   return UTF8ToUTF16WithAdjustments(text, adjustments);
532 }
533 
UnescapeBinaryURLComponent(std::string_view escaped_text,UnescapeRule::Type rules)534 std::string UnescapeBinaryURLComponent(std::string_view escaped_text,
535                                        UnescapeRule::Type rules) {
536   // Only NORMAL and REPLACE_PLUS_WITH_SPACE are supported.
537   DCHECK(rules != UnescapeRule::NONE);
538   DCHECK(!(rules &
539            ~(UnescapeRule::NORMAL | UnescapeRule::REPLACE_PLUS_WITH_SPACE)));
540 
541   // If there are no '%' characters in the string, there will be nothing to
542   // unescape, so we can take the fast path.
543   if (escaped_text.find('%') == std::string_view::npos) {
544     std::string unescaped_text(escaped_text);
545     if (rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE)
546       std::replace(unescaped_text.begin(), unescaped_text.end(), '+', ' ');
547     return unescaped_text;
548   }
549 
550   std::string unescaped_text;
551 
552   // The output of the unescaping is always smaller than the input, so we can
553   // reserve the input size to make sure we have enough buffer and don't have
554   // to allocate in the loop below.
555   // Increase capacity before size, as just resizing can grow capacity
556   // needlessly beyond our requested size.
557   unescaped_text.reserve(escaped_text.size());
558   unescaped_text.resize(escaped_text.size());
559 
560   size_t output_index = 0;
561 
562   for (size_t i = 0, max = escaped_text.size(); i < max;) {
563     unsigned char byte;
564     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
565     // to call.
566     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
567       unescaped_text[output_index++] = static_cast<char>(byte);
568       i += 3;
569       continue;
570     }
571 
572     if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
573         escaped_text[i] == '+') {
574       unescaped_text[output_index++] = ' ';
575       ++i;
576       continue;
577     }
578 
579     unescaped_text[output_index++] = escaped_text[i++];
580   }
581 
582   DCHECK_LE(output_index, unescaped_text.size());
583   unescaped_text.resize(output_index);
584   return unescaped_text;
585 }
586 
UnescapeBinaryURLComponentSafe(std::string_view escaped_text,bool fail_on_path_separators,std::string * unescaped_text)587 bool UnescapeBinaryURLComponentSafe(std::string_view escaped_text,
588                                     bool fail_on_path_separators,
589                                     std::string* unescaped_text) {
590   unescaped_text->clear();
591 
592   std::set<unsigned char> illegal_encoded_bytes;
593   for (unsigned char c = '\x00'; c < '\x20'; ++c) {
594     illegal_encoded_bytes.insert(c);
595   }
596   if (fail_on_path_separators) {
597     illegal_encoded_bytes.insert('/');
598     illegal_encoded_bytes.insert('\\');
599   }
600   if (ContainsEncodedBytes(escaped_text, illegal_encoded_bytes))
601     return false;
602 
603   *unescaped_text = UnescapeBinaryURLComponent(escaped_text);
604   return true;
605 }
606 
ContainsEncodedBytes(std::string_view escaped_text,const std::set<unsigned char> & bytes)607 bool ContainsEncodedBytes(std::string_view escaped_text,
608                           const std::set<unsigned char>& bytes) {
609   for (size_t i = 0, max = escaped_text.size(); i < max;) {
610     unsigned char byte;
611     // UnescapeUnsignedByteAtIndex does bounds checking, so this is always safe
612     // to call.
613     if (UnescapeUnsignedByteAtIndex(escaped_text, i, &byte)) {
614       if (bytes.find(byte) != bytes.end())
615         return true;
616 
617       i += 3;
618       continue;
619     }
620 
621     ++i;
622   }
623 
624   return false;
625 }
626 
// Returns a copy of |input| in which the HTML entities &lt; &gt; &amp; &quot;
// and &#39; are replaced by the characters they stand for. Any other '&' is
// copied unchanged.
//
// The input is scanned once from left to right and replaced text is never
// rescanned, so "&amp;lt;" becomes "&lt;" rather than "<". The result is
// built in a separate string: no iterator is used after the string it points
// into has been modified, and each '&' is compared only against the entity
// prefixes at its own position instead of searching the rest of the text.
std::u16string UnescapeForHTML(std::u16string_view input) {
  static constexpr struct {
    std::u16string_view ampersand_code;
    char16_t replacement;
  } kEscapeToChars[] = {
      {u"&lt;", u'<'},   {u"&gt;", u'>'},   {u"&amp;", u'&'},
      {u"&quot;", u'"'}, {u"&#39;", u'\''},
  };

  std::u16string text;
  // The output is never longer than the input.
  text.reserve(input.size());

  size_t pos = 0;
  while (pos < input.size()) {
    const size_t ampersand = input.find(u'&', pos);
    if (ampersand == std::u16string_view::npos) {
      // No further entities: copy the tail and stop.
      text.append(input.substr(pos));
      break;
    }

    // Copy the literal run that precedes the '&'.
    text.append(input.substr(pos, ampersand - pos));

    // Potential ampersand encode char.
    const std::u16string_view rest = input.substr(ampersand);
    bool replaced = false;
    for (const auto& entry : kEscapeToChars) {
      if (rest.substr(0, entry.ampersand_code.size()) ==
          entry.ampersand_code) {
        text.push_back(entry.replacement);
        pos = ampersand + entry.ampersand_code.size();
        replaced = true;
        break;
      }
    }

    if (!replaced) {
      // A bare '&' that does not start a known entity.
      text.push_back(u'&');
      pos = ampersand + 1;
    }
  }
  return text;
}
662 
663 }  // namespace base
664