• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include <algorithm>
6 
7 #include "net/base/escape.h"
8 
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/logging.h"
11 #include "base/string_piece.h"
12 #include "base/utf_string_conversions.h"
13 #include "base/utf_offset_string_conversions.h"
14 
15 namespace {
16 
// Returns true when |ch| is an ASCII hexadecimal digit: 0-9, A-F or a-f.
// Usable with any character type that compares against char literals.
template <class char_type>
inline bool IsHex(char_type ch) {
  if (ch >= '0' && ch <= '9')
    return true;
  if (ch >= 'A' && ch <= 'F')
    return true;
  return ch >= 'a' && ch <= 'f';
}
23 
24 template <class char_type>
HexToInt(char_type ch)25 inline char_type HexToInt(char_type ch) {
26   if (ch >= '0' && ch <= '9')
27     return ch - '0';
28   if (ch >= 'A' && ch <= 'F')
29     return ch - 'A' + 10;
30   if (ch >= 'a' && ch <= 'f')
31     return ch - 'a' + 10;
32   NOTREACHED();
33   return 0;
34 }
35 
36 static const char* const kHexString = "0123456789ABCDEF";
IntToHex(int i)37 inline char IntToHex(int i) {
38   DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
39   return kHexString[i];
40 }
41 
42 // A fast bit-vector map for ascii characters.
43 //
44 // Internally stores 256 bits in an array of 8 ints.
45 // Does quick bit-flicking to lookup needed characters.
46 class Charmap {
47  public:
Charmap(uint32 b0,uint32 b1,uint32 b2,uint32 b3,uint32 b4,uint32 b5,uint32 b6,uint32 b7)48   Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
49           uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
50     map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
51     map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
52   }
53 
Contains(unsigned char c) const54   bool Contains(unsigned char c) const {
55     return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
56   }
57 
58  private:
59   uint32 map_[8];
60 };
61 
62 // Given text to escape and a Charmap defining which values to escape,
63 // return an escaped string.  If use_plus is true, spaces are converted
64 // to +, otherwise, if spaces are in the charmap, they are converted to
65 // %20.
Escape(const std::string & text,const Charmap & charmap,bool use_plus)66 const std::string Escape(const std::string& text, const Charmap& charmap,
67                          bool use_plus) {
68   std::string escaped;
69   escaped.reserve(text.length() * 3);
70   for (unsigned int i = 0; i < text.length(); ++i) {
71     unsigned char c = static_cast<unsigned char>(text[i]);
72     if (use_plus && ' ' == c) {
73       escaped.push_back('+');
74     } else if (charmap.Contains(c)) {
75       escaped.push_back('%');
76       escaped.push_back(IntToHex(c >> 4));
77       escaped.push_back(IntToHex(c & 0xf));
78     } else {
79       escaped.push_back(c);
80     }
81   }
82   return escaped;
83 }
84 
// Lookup table indexed by 7-bit character value. An entry is nonzero when the
// corresponding character may be unescaped in a normal URL.
//
// The zero entries are the characters that could change how a URL is parsed
// if they were unescaped, such as # or ?. The characters &, = and + are also
// kept escaped because they can be part of a query, and unescaping them could
// change the server's parsing of that query. Space is zero here as well: in
// many cases we do not want to unescape spaces, and whether we do is
// controlled by the rules passed to Unescape*.
const char kUrlUnescape[128] = {
  // 0x00 - 0x0F: NUL and control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1F: more control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
  0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
  // 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
  // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  DEL
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
110 
111 template<typename STR>
UnescapeURLImpl(const STR & escaped_text,UnescapeRule::Type rules,size_t * offset_for_adjustment)112 STR UnescapeURLImpl(const STR& escaped_text,
113                     UnescapeRule::Type rules,
114                     size_t* offset_for_adjustment) {
115   size_t offset_temp = string16::npos;
116   if (!offset_for_adjustment)
117     offset_for_adjustment = &offset_temp;
118   else if (*offset_for_adjustment >= escaped_text.length())
119     *offset_for_adjustment = string16::npos;
120 
121   // Do not unescape anything, return the |escaped_text| text.
122   if (rules == UnescapeRule::NONE)
123     return escaped_text;
124 
125   // The output of the unescaping is always smaller than the input, so we can
126   // reserve the input size to make sure we have enough buffer and don't have
127   // to allocate in the loop below.
128   STR result;
129   result.reserve(escaped_text.length());
130 
131   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
132     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
133       // Non ASCII character, append as is.
134       result.push_back(escaped_text[i]);
135       continue;
136     }
137 
138     char current_char = static_cast<char>(escaped_text[i]);
139     if (current_char == '%' && i + 2 < max) {
140       const typename STR::value_type most_sig_digit(
141           static_cast<typename STR::value_type>(escaped_text[i + 1]));
142       const typename STR::value_type least_sig_digit(
143           static_cast<typename STR::value_type>(escaped_text[i + 2]));
144       if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) {
145         unsigned char value = HexToInt(most_sig_digit) * 16 +
146             HexToInt(least_sig_digit);
147         if (value >= 0x80 ||  // Unescape all high-bit characters.
148             // For 7-bit characters, the lookup table tells us all valid chars.
149             (kUrlUnescape[value] ||
150              // ...and we allow some additional unescaping when flags are set.
151              (value == ' ' && (rules & UnescapeRule::SPACES)) ||
152              // Allow any of the prohibited but non-control characters when
153              // we're doing "special" chars.
154              (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
155              // Additionally allow control characters if requested.
156              (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
157           // Use the unescaped version of the character.
158           size_t length_before_append = result.length();
159           result.push_back(value);
160           i += 2;
161 
162           // Adjust offset to match length change.
163           if (*offset_for_adjustment != std::string::npos) {
164             if (*offset_for_adjustment > (length_before_append + 2))
165               *offset_for_adjustment -= 2;
166             else if (*offset_for_adjustment > length_before_append)
167               *offset_for_adjustment = std::string::npos;
168           }
169         } else {
170           // Keep escaped. Append a percent and we'll get the following two
171           // digits on the next loops through.
172           result.push_back('%');
173         }
174       } else {
175         // Invalid escape sequence, just pass the percent through and continue
176         // right after it.
177         result.push_back('%');
178       }
179     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
180                escaped_text[i] == '+') {
181       result.push_back(' ');
182     } else {
183       // Normal case for unescaped characters.
184       result.push_back(escaped_text[i]);
185     }
186   }
187 
188   return result;
189 }
190 
191 }  // namespace
192 
193 // Everything except alphanumerics and !'()*-._~
194 // See RFC 2396 for the list of reserved characters.
195 static const Charmap kQueryCharmap(
196   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
197   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
198 
EscapeQueryParamValue(const std::string & text,bool use_plus)199 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
200   return Escape(text, kQueryCharmap, use_plus);
201 }
202 
203 // Convert the string to a sequence of bytes and then % escape anything
204 // except alphanumerics and !'()*-._~
EscapeQueryParamValueUTF8(const std::wstring & text,bool use_plus)205 std::wstring EscapeQueryParamValueUTF8(const std::wstring& text,
206                                        bool use_plus) {
207   return UTF8ToWide(Escape(WideToUTF8(text), kQueryCharmap, use_plus));
208 }
209 
210 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
211 static const Charmap kPathCharmap(
212   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
213   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
214 
EscapePath(const std::string & path)215 std::string EscapePath(const std::string& path) {
216   return Escape(path, kPathCharmap, false);
217 }
218 
219 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
220 static const Charmap kUrlEscape(
221   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
222   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
223 );
224 
EscapeUrlEncodedData(const std::string & path)225 std::string EscapeUrlEncodedData(const std::string& path) {
226   return Escape(path, kUrlEscape, true);
227 }
228 
229 // non-7bit
230 static const Charmap kNonASCIICharmap(
231   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
232   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
233 
EscapeNonASCII(const std::string & input)234 std::string EscapeNonASCII(const std::string& input) {
235   return Escape(input, kNonASCIICharmap, false);
236 }
237 
238 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
239 // !'()*-._~%
240 static const Charmap kExternalHandlerCharmap(
241   0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
242   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
243 
EscapeExternalHandlerValue(const std::string & text)244 std::string EscapeExternalHandlerValue(const std::string& text) {
245   return Escape(text, kExternalHandlerCharmap, false);
246 }
247 
EscapeQueryParamValue(const string16 & text,const char * codepage,bool use_plus,string16 * escaped)248 bool EscapeQueryParamValue(const string16& text, const char* codepage,
249                            bool use_plus, string16* escaped) {
250   // TODO(brettw) bug 1201094: this function should be removed, this "SKIP"
251   // behavior is wrong when the character can't be encoded properly.
252   std::string encoded;
253   if (!base::UTF16ToCodepage(text, codepage,
254                              base::OnStringConversionError::SKIP, &encoded))
255     return false;
256 
257   escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus)));
258   return true;
259 }
260 
UnescapeAndDecodeUTF8URLComponent(const std::string & text,UnescapeRule::Type rules,size_t * offset_for_adjustment)261 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
262                                            UnescapeRule::Type rules,
263                                            size_t* offset_for_adjustment) {
264   std::wstring result;
265   size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0;
266   std::string unescaped_url(
267       UnescapeURLImpl(text, rules, offset_for_adjustment));
268   if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(),
269                                 &result, offset_for_adjustment))
270     return WideToUTF16Hack(result);      // Character set looks like it's valid.
271 
272   // Not valid.  Return the escaped version.  Undo our changes to
273   // |offset_for_adjustment| since we haven't changed the string after all.
274   if (offset_for_adjustment)
275     *offset_for_adjustment = original_offset;
276   return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text,
277                                                    offset_for_adjustment));
278 }
279 
UnescapeURLComponent(const std::string & escaped_text,UnescapeRule::Type rules)280 std::string UnescapeURLComponent(const std::string& escaped_text,
281                                  UnescapeRule::Type rules) {
282   return UnescapeURLImpl(escaped_text, rules, NULL);
283 }
284 
// Returns |escaped_text| unescaped according to |rules|. string16 overload;
// no offset is tracked.
string16 UnescapeURLComponent(const string16& escaped_text,
                              UnescapeRule::Type rules) {
  size_t* const no_offset = NULL;
  return UnescapeURLImpl(escaped_text, rules, no_offset);
}
289 
290 
// Appends |c| to |*output|, substituting the matching HTML entity when |c| is
// one of < > & " or '. Any other character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    { '\'', "&#39;" },
  };
  for (size_t k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) {
    if (c != kCharsToEscape[k].key)
      continue;
    // Found an entity for |c|: emit it one character at a time and finish.
    for (const char* p = kCharsToEscape[k].replacement; *p; ++p)
      output->push_back(*p);
    return;
  }
  // No entity applies; pass the character through.
  output->push_back(c);
}
315 
AppendEscapedCharForHTML(char c,std::string * output)316 void AppendEscapedCharForHTML(char c, std::string* output) {
317   AppendEscapedCharForHTMLImpl(c, output);
318 }
319 
AppendEscapedCharForHTML(wchar_t c,string16 * output)320 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
321   AppendEscapedCharForHTMLImpl(c, output);
322 }
323 
// Returns a copy of |input| in which each of < > & " and ' has been replaced
// by its HTML entity.
template <class str>
str EscapeForHTMLImpl(const str& input) {
  const typename str::size_type length = input.size();

  str result;
  // Sized for the common case in which nothing needs escaping.
  result.reserve(length);

  for (typename str::size_type i = 0; i < length; ++i)
    AppendEscapedCharForHTMLImpl(input[i], &result);

  return result;
}
334 
EscapeForHTML(const std::string & input)335 std::string EscapeForHTML(const std::string& input) {
336   return EscapeForHTMLImpl(input);
337 }
338 
// Returns |input| with < > & " and ' replaced by HTML entities.
// string16 overload.
string16 EscapeForHTML(const string16& input) {
  return EscapeForHTMLImpl<string16>(input);
}
342 
// Returns a copy of |input| in which the entities &lt; &gt; &amp; &quot; and
// &#39; have been replaced by the characters they stand for. Any other text,
// including an unrecognized or truncated entity, is left as is.
//
// Each replacement is final: scanning resumes after the character that was
// written, so "&amp;lt;" becomes "&lt;" and is not unescaped a second time.
string16 UnescapeForHTML(const string16& input) {
  static const struct {
    const wchar_t* ampersand_code;
    const char replacement;
  } kEscapeToChars[] = {
    { L"&lt;", '<' },
    { L"&gt;", '>' },
    { L"&amp;", '&' },
    { L"&quot;", '"' },
    { L"&#39;", '\''},
  };

  // Fast path: without an ampersand there is nothing to unescape.
  if (input.find('&') == string16::npos)
    return input;

  // The 16-bit form of each entity, converted lazily on first use.
  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
  string16 text(input);
  // Walk by index rather than by iterator: replace() below modifies |text|,
  // and the standard does not guarantee that iterators survive that.
  for (size_t index = 0; index < text.length(); ++index) {
    if (text[index] != '&')
      continue;

    // Potential ampersand encode char.
    for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
      if (ampersand_chars[i].empty())
        ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
      const string16& code = ampersand_chars[i];
      // compare() looks only at the characters starting at |index|; a
      // find() would keep scanning the rest of the string on a mismatch.
      if (text.compare(index, code.length(), code) == 0) {
        text.replace(
            index, code.length(), static_cast<size_t>(1),
            static_cast<string16::value_type>(kEscapeToChars[i].replacement));
        break;
      }
    }
  }
  return text;
}
377