1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include <algorithm>
6
7 #include "net/base/escape.h"
8
9 #include "base/i18n/icu_string_conversions.h"
10 #include "base/logging.h"
11 #include "base/string_piece.h"
12 #include "base/utf_string_conversions.h"
13 #include "base/utf_offset_string_conversions.h"
14
15 namespace {
16
// Returns true when |ch| is an ASCII hexadecimal digit (0-9, A-F or a-f).
// Works for any character type that can be compared against char literals.
template <class char_type>
inline bool IsHex(char_type ch) {
  if (ch >= '0' && ch <= '9')
    return true;
  if (ch >= 'A' && ch <= 'F')
    return true;
  return ch >= 'a' && ch <= 'f';
}
23
24 template <class char_type>
HexToInt(char_type ch)25 inline char_type HexToInt(char_type ch) {
26 if (ch >= '0' && ch <= '9')
27 return ch - '0';
28 if (ch >= 'A' && ch <= 'F')
29 return ch - 'A' + 10;
30 if (ch >= 'a' && ch <= 'f')
31 return ch - 'a' + 10;
32 NOTREACHED();
33 return 0;
34 }
35
36 static const char* const kHexString = "0123456789ABCDEF";
IntToHex(int i)37 inline char IntToHex(int i) {
38 DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
39 return kHexString[i];
40 }
41
42 // A fast bit-vector map for ascii characters.
43 //
44 // Internally stores 256 bits in an array of 8 ints.
45 // Does quick bit-flicking to lookup needed characters.
46 class Charmap {
47 public:
Charmap(uint32 b0,uint32 b1,uint32 b2,uint32 b3,uint32 b4,uint32 b5,uint32 b6,uint32 b7)48 Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
49 uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
50 map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
51 map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
52 }
53
Contains(unsigned char c) const54 bool Contains(unsigned char c) const {
55 return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
56 }
57
58 private:
59 uint32 map_[8];
60 };
61
62 // Given text to escape and a Charmap defining which values to escape,
63 // return an escaped string. If use_plus is true, spaces are converted
64 // to +, otherwise, if spaces are in the charmap, they are converted to
65 // %20.
Escape(const std::string & text,const Charmap & charmap,bool use_plus)66 const std::string Escape(const std::string& text, const Charmap& charmap,
67 bool use_plus) {
68 std::string escaped;
69 escaped.reserve(text.length() * 3);
70 for (unsigned int i = 0; i < text.length(); ++i) {
71 unsigned char c = static_cast<unsigned char>(text[i]);
72 if (use_plus && ' ' == c) {
73 escaped.push_back('+');
74 } else if (charmap.Contains(c)) {
75 escaped.push_back('%');
76 escaped.push_back(IntToHex(c >> 4));
77 escaped.push_back(IntToHex(c & 0xf));
78 } else {
79 escaped.push_back(c);
80 }
81 }
82 return escaped;
83 }
84
// Lookup table indexed by 7-bit character value. An entry is nonzero when the
// corresponding character may always be unescaped in a normal URL, and zero
// when unescaping it could change how the URL is parsed, so it is only
// unescaped when the caller explicitly asks for it. In many cases we won't
// want to unescape spaces either, but that is controlled by the parameters to
// Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since they could be part of
// a query and unescaping them could change the server's parsing of the query.
const char kUrlUnescape[128] = {
  // 0x00 - 0x0f: NUL and control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10 - 0x1f: control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20 - 0x2f:
  // ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
  // 0x30 - 0x3f:
  //  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
  // 0x40 - 0x4f:
  //  @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x50 - 0x5f:
  //  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x60 - 0x6f:
  //  `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x70 - 0x7f:
  //  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
110
111 template<typename STR>
UnescapeURLImpl(const STR & escaped_text,UnescapeRule::Type rules,size_t * offset_for_adjustment)112 STR UnescapeURLImpl(const STR& escaped_text,
113 UnescapeRule::Type rules,
114 size_t* offset_for_adjustment) {
115 size_t offset_temp = string16::npos;
116 if (!offset_for_adjustment)
117 offset_for_adjustment = &offset_temp;
118 else if (*offset_for_adjustment >= escaped_text.length())
119 *offset_for_adjustment = string16::npos;
120
121 // Do not unescape anything, return the |escaped_text| text.
122 if (rules == UnescapeRule::NONE)
123 return escaped_text;
124
125 // The output of the unescaping is always smaller than the input, so we can
126 // reserve the input size to make sure we have enough buffer and don't have
127 // to allocate in the loop below.
128 STR result;
129 result.reserve(escaped_text.length());
130
131 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
132 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
133 // Non ASCII character, append as is.
134 result.push_back(escaped_text[i]);
135 continue;
136 }
137
138 char current_char = static_cast<char>(escaped_text[i]);
139 if (current_char == '%' && i + 2 < max) {
140 const typename STR::value_type most_sig_digit(
141 static_cast<typename STR::value_type>(escaped_text[i + 1]));
142 const typename STR::value_type least_sig_digit(
143 static_cast<typename STR::value_type>(escaped_text[i + 2]));
144 if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) {
145 unsigned char value = HexToInt(most_sig_digit) * 16 +
146 HexToInt(least_sig_digit);
147 if (value >= 0x80 || // Unescape all high-bit characters.
148 // For 7-bit characters, the lookup table tells us all valid chars.
149 (kUrlUnescape[value] ||
150 // ...and we allow some additional unescaping when flags are set.
151 (value == ' ' && (rules & UnescapeRule::SPACES)) ||
152 // Allow any of the prohibited but non-control characters when
153 // we're doing "special" chars.
154 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
155 // Additionally allow control characters if requested.
156 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
157 // Use the unescaped version of the character.
158 size_t length_before_append = result.length();
159 result.push_back(value);
160 i += 2;
161
162 // Adjust offset to match length change.
163 if (*offset_for_adjustment != std::string::npos) {
164 if (*offset_for_adjustment > (length_before_append + 2))
165 *offset_for_adjustment -= 2;
166 else if (*offset_for_adjustment > length_before_append)
167 *offset_for_adjustment = std::string::npos;
168 }
169 } else {
170 // Keep escaped. Append a percent and we'll get the following two
171 // digits on the next loops through.
172 result.push_back('%');
173 }
174 } else {
175 // Invalid escape sequence, just pass the percent through and continue
176 // right after it.
177 result.push_back('%');
178 }
179 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
180 escaped_text[i] == '+') {
181 result.push_back(' ');
182 } else {
183 // Normal case for unescaped characters.
184 result.push_back(escaped_text[i]);
185 }
186 }
187
188 return result;
189 }
190
191 } // namespace
192
193 // Everything except alphanumerics and !'()*-._~
194 // See RFC 2396 for the list of reserved characters.
195 static const Charmap kQueryCharmap(
196 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
197 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
198
EscapeQueryParamValue(const std::string & text,bool use_plus)199 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
200 return Escape(text, kQueryCharmap, use_plus);
201 }
202
203 // Convert the string to a sequence of bytes and then % escape anything
204 // except alphanumerics and !'()*-._~
EscapeQueryParamValueUTF8(const std::wstring & text,bool use_plus)205 std::wstring EscapeQueryParamValueUTF8(const std::wstring& text,
206 bool use_plus) {
207 return UTF8ToWide(Escape(WideToUTF8(text), kQueryCharmap, use_plus));
208 }
209
210 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
211 static const Charmap kPathCharmap(
212 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
213 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
214
EscapePath(const std::string & path)215 std::string EscapePath(const std::string& path) {
216 return Escape(path, kPathCharmap, false);
217 }
218
219 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
220 static const Charmap kUrlEscape(
221 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
222 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
223 );
224
EscapeUrlEncodedData(const std::string & path)225 std::string EscapeUrlEncodedData(const std::string& path) {
226 return Escape(path, kUrlEscape, true);
227 }
228
229 // non-7bit
230 static const Charmap kNonASCIICharmap(
231 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
232 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
233
EscapeNonASCII(const std::string & input)234 std::string EscapeNonASCII(const std::string& input) {
235 return Escape(input, kNonASCIICharmap, false);
236 }
237
238 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
239 // !'()*-._~%
240 static const Charmap kExternalHandlerCharmap(
241 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
242 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
243
EscapeExternalHandlerValue(const std::string & text)244 std::string EscapeExternalHandlerValue(const std::string& text) {
245 return Escape(text, kExternalHandlerCharmap, false);
246 }
247
EscapeQueryParamValue(const string16 & text,const char * codepage,bool use_plus,string16 * escaped)248 bool EscapeQueryParamValue(const string16& text, const char* codepage,
249 bool use_plus, string16* escaped) {
250 // TODO(brettw) bug 1201094: this function should be removed, this "SKIP"
251 // behavior is wrong when the character can't be encoded properly.
252 std::string encoded;
253 if (!base::UTF16ToCodepage(text, codepage,
254 base::OnStringConversionError::SKIP, &encoded))
255 return false;
256
257 escaped->assign(UTF8ToUTF16(Escape(encoded, kQueryCharmap, use_plus)));
258 return true;
259 }
260
UnescapeAndDecodeUTF8URLComponent(const std::string & text,UnescapeRule::Type rules,size_t * offset_for_adjustment)261 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
262 UnescapeRule::Type rules,
263 size_t* offset_for_adjustment) {
264 std::wstring result;
265 size_t original_offset = offset_for_adjustment ? *offset_for_adjustment : 0;
266 std::string unescaped_url(
267 UnescapeURLImpl(text, rules, offset_for_adjustment));
268 if (UTF8ToWideAndAdjustOffset(unescaped_url.data(), unescaped_url.length(),
269 &result, offset_for_adjustment))
270 return WideToUTF16Hack(result); // Character set looks like it's valid.
271
272 // Not valid. Return the escaped version. Undo our changes to
273 // |offset_for_adjustment| since we haven't changed the string after all.
274 if (offset_for_adjustment)
275 *offset_for_adjustment = original_offset;
276 return WideToUTF16Hack(UTF8ToWideAndAdjustOffset(text,
277 offset_for_adjustment));
278 }
279
UnescapeURLComponent(const std::string & escaped_text,UnescapeRule::Type rules)280 std::string UnescapeURLComponent(const std::string& escaped_text,
281 UnescapeRule::Type rules) {
282 return UnescapeURLImpl(escaped_text, rules, NULL);
283 }
284
// Unescapes the UTF-16 string |escaped_text| according to |rules|. No offset
// tracking is performed.
string16 UnescapeURLComponent(const string16& escaped_text,
                              UnescapeRule::Type rules) {
  return UnescapeURLImpl<string16>(escaped_text, rules, NULL);
}
289
290
// Appends |c| to |*output|, replacing the five characters that are special in
// HTML text and attribute values (< > & " ') with their character references.
// Every other character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    { '\'', "&#39;" },
  };
  const size_t kNumCharsToEscape =
      sizeof(kCharsToEscape) / sizeof(kCharsToEscape[0]);

  for (size_t k = 0; k < kNumCharsToEscape; ++k) {
    if (c == kCharsToEscape[k].key) {
      // The replacements are plain ASCII, so they can be appended one
      // character at a time to either a narrow or a wide string.
      for (const char* p = kCharsToEscape[k].replacement; *p; ++p)
        output->push_back(*p);
      return;
    }
  }

  // Not a special character.
  output->push_back(c);
}
315
AppendEscapedCharForHTML(char c,std::string * output)316 void AppendEscapedCharForHTML(char c, std::string* output) {
317 AppendEscapedCharForHTMLImpl(c, output);
318 }
319
AppendEscapedCharForHTML(wchar_t c,string16 * output)320 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
321 AppendEscapedCharForHTMLImpl(c, output);
322 }
323
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl(). Shared by the narrow and wide overloads of
// EscapeForHTML().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str escaped;
  // Optimize for the common case where nothing needs escaping.
  escaped.reserve(input.size());

  for (size_t i = 0; i < input.size(); ++i)
    AppendEscapedCharForHTMLImpl(input[i], &escaped);

  return escaped;
}
334
EscapeForHTML(const std::string & input)335 std::string EscapeForHTML(const std::string& input) {
336 return EscapeForHTMLImpl(input);
337 }
338
// Returns the HTML-escaped form of the UTF-16 string |input|.
string16 EscapeForHTML(const string16& input) {
  return EscapeForHTMLImpl<string16>(input);
}
342
// Returns a copy of |input| in which the character references &lt; &gt;
// &amp; &quot; and &#39; have been replaced by the characters they stand for.
// No other references are recognized. Each '&' in the text is examined once:
// after a replacement the scan resumes after the replaced character, so
// "&amp;lt;" becomes "&lt;" rather than "<".
string16 UnescapeForHTML(const string16& input) {
  static const struct {
    const wchar_t* ampersand_code;
    const char replacement;
  } kEscapeToChars[] = {
    { L"&lt;", '<' },
    { L"&gt;", '>' },
    { L"&amp;", '&' },
    { L"&quot;", '"' },
    { L"&#39;", '\''},
  };

  // Fast path: nothing to do if there is no ampersand at all.
  if (input.find(WideToUTF16(L"&")) == string16::npos)
    return input;

  // UTF-16 copies of the codes above, converted lazily on first use.
  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
  string16 text(input);
  // Walk by index rather than by iterator: replace() below modifies |text|
  // and may invalidate iterators into it.
  for (size_t index = 0; index < text.length(); ++index) {
    if (text[index] != '&')
      continue;

    // Potential ampersand encode char.
    for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
      if (ampersand_chars[i].empty())
        ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
      const size_t code_length = ampersand_chars[i].length();
      // Only look at the characters starting at |index| instead of searching
      // the whole remainder of the string.
      if (text.compare(index, code_length, ampersand_chars[i]) == 0) {
        text.replace(index, code_length, 1, kEscapeToChars[i].replacement);
        break;
      }
    }
  }
  return text;
}
377