• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/escape.h"
6 
7 #include <algorithm>
8 
9 #include "base/logging.h"
10 #include "base/scoped_ptr.h"
11 #include "base/string_piece.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "base/utf_offset_string_conversions.h"
15 
16 namespace {
17 
18 static const char* const kHexString = "0123456789ABCDEF";
IntToHex(int i)19 inline char IntToHex(int i) {
20   DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
21   return kHexString[i];
22 }
23 
24 // A fast bit-vector map for ascii characters.
25 //
26 // Internally stores 256 bits in an array of 8 ints.
27 // Does quick bit-flicking to lookup needed characters.
28 class Charmap {
29  public:
Charmap(uint32 b0,uint32 b1,uint32 b2,uint32 b3,uint32 b4,uint32 b5,uint32 b6,uint32 b7)30   Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
31           uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
32     map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
33     map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
34   }
35 
Contains(unsigned char c) const36   bool Contains(unsigned char c) const {
37     return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
38   }
39 
40  private:
41   uint32 map_[8];
42 };
43 
44 // Given text to escape and a Charmap defining which values to escape,
45 // return an escaped string.  If use_plus is true, spaces are converted
46 // to +, otherwise, if spaces are in the charmap, they are converted to
47 // %20.
Escape(const std::string & text,const Charmap & charmap,bool use_plus)48 std::string Escape(const std::string& text, const Charmap& charmap,
49                    bool use_plus) {
50   std::string escaped;
51   escaped.reserve(text.length() * 3);
52   for (unsigned int i = 0; i < text.length(); ++i) {
53     unsigned char c = static_cast<unsigned char>(text[i]);
54     if (use_plus && ' ' == c) {
55       escaped.push_back('+');
56     } else if (charmap.Contains(c)) {
57       escaped.push_back('%');
58       escaped.push_back(IntToHex(c >> 4));
59       escaped.push_back(IntToHex(c & 0xf));
60     } else {
61       escaped.push_back(c);
62     }
63   }
64   return escaped;
65 }
66 
67 // Contains nonzero when the corresponding character is unescapable for normal
68 // URLs. These characters are the ones that may change the parsing of a URL, so
69 // we don't want to unescape them sometimes. In many cases we won't want to
70 // unescape spaces, but that is controlled by parameters to Unescape*.
71 //
72 // The basic rule is that we can't unescape anything that would change parsing
73 // like # or ?. We also can't unescape &, =, or + since that could be part of a
74 // query and that could change the server's parsing of the query. Nor can we
75 // unescape \ since googleurl will convert it to a /.
76 //
77 // Lastly, we can't unescape anything that doesn't have a canonical
78 // representation in a URL. This means that unescaping will change the URL, and
79 // you could get different behavior if you copy and paste the URL, or press
80 // enter in the URL bar. The list of characters that fall into this category
81 // are the ones labeled PASS (allow either escaped or unescaped) in the big
82 // lookup table at the top of googleurl/src/url_canon_path.cc
83 const char kUrlUnescape[128] = {
84 //   NULL, control chars...
85      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
86      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
87 //  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
88      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
89 //   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
90      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
91 //   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
92      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
93 //   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
94      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
95 //   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
96      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
97 //   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
98      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
99 };
100 
101 template<typename STR>
UnescapeURLWithOffsetsImpl(const STR & escaped_text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)102 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
103                                UnescapeRule::Type rules,
104                                std::vector<size_t>* offsets_for_adjustment) {
105   if (offsets_for_adjustment) {
106     std::for_each(offsets_for_adjustment->begin(),
107                   offsets_for_adjustment->end(),
108                   LimitOffset<std::wstring>(escaped_text.length()));
109   }
110   // Do not unescape anything, return the |escaped_text| text.
111   if (rules == UnescapeRule::NONE)
112     return escaped_text;
113 
114   // The output of the unescaping is always smaller than the input, so we can
115   // reserve the input size to make sure we have enough buffer and don't have
116   // to allocate in the loop below.
117   STR result;
118   result.reserve(escaped_text.length());
119 
120   AdjustEncodingOffset::Adjustments adjustments;  // Locations of adjusted text.
121   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123       // Non ASCII character, append as is.
124       result.push_back(escaped_text[i]);
125       continue;
126     }
127 
128     char current_char = static_cast<char>(escaped_text[i]);
129     if (current_char == '%' && i + 2 < max) {
130       const typename STR::value_type most_sig_digit(
131           static_cast<typename STR::value_type>(escaped_text[i + 1]));
132       const typename STR::value_type least_sig_digit(
133           static_cast<typename STR::value_type>(escaped_text[i + 2]));
134       if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
135         unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
136             HexDigitToInt(least_sig_digit);
137         if (value >= 0x80 ||  // Unescape all high-bit characters.
138             // For 7-bit characters, the lookup table tells us all valid chars.
139             (kUrlUnescape[value] ||
140              // ...and we allow some additional unescaping when flags are set.
141              (value == ' ' && (rules & UnescapeRule::SPACES)) ||
142              // Allow any of the prohibited but non-control characters when
143              // we're doing "special" chars.
144              (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
145              // Additionally allow control characters if requested.
146              (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
147           // Use the unescaped version of the character.
148           adjustments.push_back(i);
149           result.push_back(value);
150           i += 2;
151         } else {
152           // Keep escaped. Append a percent and we'll get the following two
153           // digits on the next loops through.
154           result.push_back('%');
155         }
156       } else {
157         // Invalid escape sequence, just pass the percent through and continue
158         // right after it.
159         result.push_back('%');
160       }
161     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162                escaped_text[i] == '+') {
163       result.push_back(' ');
164     } else {
165       // Normal case for unescaped characters.
166       result.push_back(escaped_text[i]);
167     }
168   }
169 
170   // Make offset adjustment.
171   if (offsets_for_adjustment && !adjustments.empty()) {
172     std::for_each(offsets_for_adjustment->begin(),
173                    offsets_for_adjustment->end(),
174                    AdjustEncodingOffset(adjustments));
175   }
176 
177   return result;
178 }
179 
180 template<typename STR>
UnescapeURLImpl(const STR & escaped_text,UnescapeRule::Type rules,size_t * offset_for_adjustment)181 STR UnescapeURLImpl(const STR& escaped_text,
182                     UnescapeRule::Type rules,
183                     size_t* offset_for_adjustment) {
184   std::vector<size_t> offsets;
185   if (offset_for_adjustment)
186     offsets.push_back(*offset_for_adjustment);
187   STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets);
188   if (offset_for_adjustment)
189     *offset_for_adjustment = offsets[0];
190   return result;
191 }
192 
193 }  // namespace
194 
195 // Everything except alphanumerics and !'()*-._~
196 // See RFC 2396 for the list of reserved characters.
197 static const Charmap kQueryCharmap(
198   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
199   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
200 
EscapeQueryParamValue(const std::string & text,bool use_plus)201 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
202   return Escape(text, kQueryCharmap, use_plus);
203 }
204 
205 // Convert the string to a sequence of bytes and then % escape anything
206 // except alphanumerics and !'()*-._~
EscapeQueryParamValueUTF8(const string16 & text,bool use_plus)207 string16 EscapeQueryParamValueUTF8(const string16& text,
208                                    bool use_plus) {
209   return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus));
210 }
211 
212 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
213 static const Charmap kPathCharmap(
214   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
215   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
216 
EscapePath(const std::string & path)217 std::string EscapePath(const std::string& path) {
218   return Escape(path, kPathCharmap, false);
219 }
220 
221 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
222 static const Charmap kUrlEscape(
223   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
224   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
225 );
226 
EscapeUrlEncodedData(const std::string & path)227 std::string EscapeUrlEncodedData(const std::string& path) {
228   return Escape(path, kUrlEscape, true);
229 }
230 
231 // non-7bit
232 static const Charmap kNonASCIICharmap(
233   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
234   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
235 
EscapeNonASCII(const std::string & input)236 std::string EscapeNonASCII(const std::string& input) {
237   return Escape(input, kNonASCIICharmap, false);
238 }
239 
240 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
241 // !'()*-._~%
242 static const Charmap kExternalHandlerCharmap(
243   0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
244   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
245 
EscapeExternalHandlerValue(const std::string & text)246 std::string EscapeExternalHandlerValue(const std::string& text) {
247   return Escape(text, kExternalHandlerCharmap, false);
248 }
249 
UnescapeAndDecodeUTF8URLComponentWithOffsets(const std::string & text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)250 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
251     const std::string& text,
252     UnescapeRule::Type rules,
253     std::vector<size_t>* offsets_for_adjustment) {
254   std::wstring result;
255   std::vector<size_t> original_offsets;
256   if (offsets_for_adjustment)
257     original_offsets = *offsets_for_adjustment;
258   std::string unescaped_url(
259       UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
260   if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(),
261                                 &result, offsets_for_adjustment))
262     return WideToUTF16Hack(result);      // Character set looks like it's valid.
263 
264   // Not valid.  Return the escaped version.  Undo our changes to
265   // |offset_for_adjustment| since we haven't changed the string after all.
266   if (offsets_for_adjustment)
267     *offsets_for_adjustment = original_offsets;
268   return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets(
269       text, offsets_for_adjustment));
270 }
271 
UnescapeAndDecodeUTF8URLComponent(const std::string & text,UnescapeRule::Type rules,size_t * offset_for_adjustment)272 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
273                                            UnescapeRule::Type rules,
274                                            size_t* offset_for_adjustment) {
275   std::vector<size_t> offsets;
276   if (offset_for_adjustment)
277     offsets.push_back(*offset_for_adjustment);
278   string16 result =
279       UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
280   if (offset_for_adjustment)
281     *offset_for_adjustment = offsets[0];
282   return result;
283 }
284 
UnescapeURLComponent(const std::string & escaped_text,UnescapeRule::Type rules)285 std::string UnescapeURLComponent(const std::string& escaped_text,
286                                  UnescapeRule::Type rules) {
287   return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL);
288 }
289 
UnescapeURLComponent(const string16 & escaped_text,UnescapeRule::Type rules)290 string16 UnescapeURLComponent(const string16& escaped_text,
291                               UnescapeRule::Type rules) {
292   return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules, NULL);
293 }
294 
295 
// Appends |c| to |output|, substituting the matching HTML entity when |c| is
// one of < > & " or '. Any other character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    { '\'', "&#39;" },
  };

  for (size_t k = 0; k < ARRAYSIZE_UNSAFE(kCharsToEscape); ++k) {
    if (c != kCharsToEscape[k].key)
      continue;
    // Found an entity for |c|: emit it character by character and stop.
    for (const char* p = kCharsToEscape[k].replacement; *p; ++p)
      output->push_back(*p);
    return;
  }

  // No entity applies; pass the character through.
  output->push_back(c);
}
320 
AppendEscapedCharForHTML(char c,std::string * output)321 void AppendEscapedCharForHTML(char c, std::string* output) {
322   AppendEscapedCharForHTMLImpl(c, output);
323 }
324 
AppendEscapedCharForHTML(wchar_t c,string16 * output)325 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
326   AppendEscapedCharForHTMLImpl(c, output);
327 }
328 
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl.
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str escaped;
  // Sized for the common case where nothing needs escaping.
  escaped.reserve(input.size());

  typename str::const_iterator cur = input.begin();
  while (cur != input.end()) {
    AppendEscapedCharForHTMLImpl(*cur, &escaped);
    ++cur;
  }

  return escaped;
}
339 
EscapeForHTML(const std::string & input)340 std::string EscapeForHTML(const std::string& input) {
341   return EscapeForHTMLImpl(input);
342 }
343 
EscapeForHTML(const string16 & input)344 string16 EscapeForHTML(const string16& input) {
345   return EscapeForHTMLImpl(input);
346 }
347 
UnescapeForHTML(const string16 & input)348 string16 UnescapeForHTML(const string16& input) {
349   static const struct {
350     const wchar_t* ampersand_code;
351     const char replacement;
352   } kEscapeToChars[] = {
353     { L"&lt;", '<' },
354     { L"&gt;", '>' },
355     { L"&amp;", '&' },
356     { L"&quot;", '"' },
357     { L"&#39;", '\''},
358   };
359 
360   if (input.find(WideToUTF16(L"&")) == std::string::npos)
361     return input;
362 
363   string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
364   string16 text(input);
365   for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) {
366     if (*iter == '&') {
367       // Potential ampersand encode char.
368       size_t index = iter - text.begin();
369       for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
370         if (ampersand_chars[i].empty())
371           ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
372         if (text.find(ampersand_chars[i], index) == index) {
373           text.replace(iter, iter + ampersand_chars[i].length(),
374                        1, kEscapeToChars[i].replacement);
375           break;
376         }
377       }
378     }
379   }
380   return text;
381 }
382 
AdjustEncodingOffset(const Adjustments & adjustments)383 AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
384   : adjustments(adjustments) {}
385 
operator ()(size_t & offset)386 void AdjustEncodingOffset::operator()(size_t& offset) {
387   // For each encoded character occurring before an offset subtract 2.
388   if (offset == string16::npos)
389     return;
390   size_t adjusted_offset = offset;
391   for (Adjustments::const_iterator i = adjustments.begin();
392        i != adjustments.end(); ++i) {
393     size_t location = *i;
394     if (offset <= location) {
395       offset = adjusted_offset;
396       return;
397     }
398     if (offset <= (location + 2)) {
399       offset = string16::npos;
400       return;
401     }
402     adjusted_offset -= 2;
403   }
404   offset = adjusted_offset;
405 }
406