1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/escape.h"
6
7 #include <algorithm>
8
9 #include "base/logging.h"
10 #include "base/scoped_ptr.h"
11 #include "base/string_piece.h"
12 #include "base/string_util.h"
13 #include "base/utf_string_conversions.h"
14 #include "base/utf_offset_string_conversions.h"
15
16 namespace {
17
18 static const char* const kHexString = "0123456789ABCDEF";
IntToHex(int i)19 inline char IntToHex(int i) {
20 DCHECK(i >= 0 && i <= 15) << i << " not a hex value";
21 return kHexString[i];
22 }
23
24 // A fast bit-vector map for ascii characters.
25 //
26 // Internally stores 256 bits in an array of 8 ints.
27 // Does quick bit-flicking to lookup needed characters.
28 class Charmap {
29 public:
Charmap(uint32 b0,uint32 b1,uint32 b2,uint32 b3,uint32 b4,uint32 b5,uint32 b6,uint32 b7)30 Charmap(uint32 b0, uint32 b1, uint32 b2, uint32 b3,
31 uint32 b4, uint32 b5, uint32 b6, uint32 b7) {
32 map_[0] = b0; map_[1] = b1; map_[2] = b2; map_[3] = b3;
33 map_[4] = b4; map_[5] = b5; map_[6] = b6; map_[7] = b7;
34 }
35
Contains(unsigned char c) const36 bool Contains(unsigned char c) const {
37 return (map_[c >> 5] & (1 << (c & 31))) ? true : false;
38 }
39
40 private:
41 uint32 map_[8];
42 };
43
44 // Given text to escape and a Charmap defining which values to escape,
45 // return an escaped string. If use_plus is true, spaces are converted
46 // to +, otherwise, if spaces are in the charmap, they are converted to
47 // %20.
Escape(const std::string & text,const Charmap & charmap,bool use_plus)48 std::string Escape(const std::string& text, const Charmap& charmap,
49 bool use_plus) {
50 std::string escaped;
51 escaped.reserve(text.length() * 3);
52 for (unsigned int i = 0; i < text.length(); ++i) {
53 unsigned char c = static_cast<unsigned char>(text[i]);
54 if (use_plus && ' ' == c) {
55 escaped.push_back('+');
56 } else if (charmap.Contains(c)) {
57 escaped.push_back('%');
58 escaped.push_back(IntToHex(c >> 4));
59 escaped.push_back(IntToHex(c & 0xf));
60 } else {
61 escaped.push_back(c);
62 }
63 }
64 return escaped;
65 }
66
// Lookup table indexed by 7-bit character value. An entry is nonzero when the
// corresponding character may always be unescaped in a normal URL, and zero
// when unescaping it needs an explicit UnescapeRule flag (or is never done).
// Spaces are zero here; in many cases we won't want to unescape them, and
// that is controlled by the parameters to Unescape*.
//
// The zero entries fall into these groups:
//  - Characters that would change how the URL is parsed, like # or ?.
//  - &, = and +, which may be part of a query, so unescaping them could
//    change the server's parsing of that query.
//  - \, because googleurl will convert it to a /.
//  - Characters without a canonical representation in a URL. Unescaping one
//    of these changes the URL, so copying and pasting it, or pressing enter
//    in the URL bar, could behave differently. These are the characters
//    labeled PASS (allow either escaped or unescaped) in the big lookup table
//    at the top of googleurl/src/url_canon_path.cc.
const char kUrlUnescape[128] = {
  // 0x00-0x0F: NUL and control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10-0x1F: more control characters.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20-0x2F: ' ' ! " # $ % & ' ( ) * + , - . /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // 0x40-0x4F: @ A B C D E F G H I J K L M N O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
  // 0x60-0x6F: ` a b c d e f g h i j k l m n o
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // 0x70-0x7F: p q r s t u v w x y z { | } ~ DEL
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};
100
101 template<typename STR>
UnescapeURLWithOffsetsImpl(const STR & escaped_text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)102 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
103 UnescapeRule::Type rules,
104 std::vector<size_t>* offsets_for_adjustment) {
105 if (offsets_for_adjustment) {
106 std::for_each(offsets_for_adjustment->begin(),
107 offsets_for_adjustment->end(),
108 LimitOffset<std::wstring>(escaped_text.length()));
109 }
110 // Do not unescape anything, return the |escaped_text| text.
111 if (rules == UnescapeRule::NONE)
112 return escaped_text;
113
114 // The output of the unescaping is always smaller than the input, so we can
115 // reserve the input size to make sure we have enough buffer and don't have
116 // to allocate in the loop below.
117 STR result;
118 result.reserve(escaped_text.length());
119
120 AdjustEncodingOffset::Adjustments adjustments; // Locations of adjusted text.
121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123 // Non ASCII character, append as is.
124 result.push_back(escaped_text[i]);
125 continue;
126 }
127
128 char current_char = static_cast<char>(escaped_text[i]);
129 if (current_char == '%' && i + 2 < max) {
130 const typename STR::value_type most_sig_digit(
131 static_cast<typename STR::value_type>(escaped_text[i + 1]));
132 const typename STR::value_type least_sig_digit(
133 static_cast<typename STR::value_type>(escaped_text[i + 2]));
134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
136 HexDigitToInt(least_sig_digit);
137 if (value >= 0x80 || // Unescape all high-bit characters.
138 // For 7-bit characters, the lookup table tells us all valid chars.
139 (kUrlUnescape[value] ||
140 // ...and we allow some additional unescaping when flags are set.
141 (value == ' ' && (rules & UnescapeRule::SPACES)) ||
142 // Allow any of the prohibited but non-control characters when
143 // we're doing "special" chars.
144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
145 // Additionally allow control characters if requested.
146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
147 // Use the unescaped version of the character.
148 adjustments.push_back(i);
149 result.push_back(value);
150 i += 2;
151 } else {
152 // Keep escaped. Append a percent and we'll get the following two
153 // digits on the next loops through.
154 result.push_back('%');
155 }
156 } else {
157 // Invalid escape sequence, just pass the percent through and continue
158 // right after it.
159 result.push_back('%');
160 }
161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162 escaped_text[i] == '+') {
163 result.push_back(' ');
164 } else {
165 // Normal case for unescaped characters.
166 result.push_back(escaped_text[i]);
167 }
168 }
169
170 // Make offset adjustment.
171 if (offsets_for_adjustment && !adjustments.empty()) {
172 std::for_each(offsets_for_adjustment->begin(),
173 offsets_for_adjustment->end(),
174 AdjustEncodingOffset(adjustments));
175 }
176
177 return result;
178 }
179
180 template<typename STR>
UnescapeURLImpl(const STR & escaped_text,UnescapeRule::Type rules,size_t * offset_for_adjustment)181 STR UnescapeURLImpl(const STR& escaped_text,
182 UnescapeRule::Type rules,
183 size_t* offset_for_adjustment) {
184 std::vector<size_t> offsets;
185 if (offset_for_adjustment)
186 offsets.push_back(*offset_for_adjustment);
187 STR result = UnescapeURLWithOffsetsImpl(escaped_text, rules, &offsets);
188 if (offset_for_adjustment)
189 *offset_for_adjustment = offsets[0];
190 return result;
191 }
192
193 } // namespace
194
195 // Everything except alphanumerics and !'()*-._~
196 // See RFC 2396 for the list of reserved characters.
197 static const Charmap kQueryCharmap(
198 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
199 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
200
EscapeQueryParamValue(const std::string & text,bool use_plus)201 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
202 return Escape(text, kQueryCharmap, use_plus);
203 }
204
205 // Convert the string to a sequence of bytes and then % escape anything
206 // except alphanumerics and !'()*-._~
EscapeQueryParamValueUTF8(const string16 & text,bool use_plus)207 string16 EscapeQueryParamValueUTF8(const string16& text,
208 bool use_plus) {
209 return UTF8ToUTF16(Escape(UTF16ToUTF8(text), kQueryCharmap, use_plus));
210 }
211
212 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
213 static const Charmap kPathCharmap(
214 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
215 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
216
EscapePath(const std::string & path)217 std::string EscapePath(const std::string& path) {
218 return Escape(path, kPathCharmap, false);
219 }
220
221 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
222 static const Charmap kUrlEscape(
223 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
224 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
225 );
226
EscapeUrlEncodedData(const std::string & path)227 std::string EscapeUrlEncodedData(const std::string& path) {
228 return Escape(path, kUrlEscape, true);
229 }
230
231 // non-7bit
232 static const Charmap kNonASCIICharmap(
233 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
234 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
235
EscapeNonASCII(const std::string & input)236 std::string EscapeNonASCII(const std::string& input) {
237 return Escape(input, kNonASCIICharmap, false);
238 }
239
240 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
241 // !'()*-._~%
242 static const Charmap kExternalHandlerCharmap(
243 0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
244 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL);
245
EscapeExternalHandlerValue(const std::string & text)246 std::string EscapeExternalHandlerValue(const std::string& text) {
247 return Escape(text, kExternalHandlerCharmap, false);
248 }
249
UnescapeAndDecodeUTF8URLComponentWithOffsets(const std::string & text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)250 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
251 const std::string& text,
252 UnescapeRule::Type rules,
253 std::vector<size_t>* offsets_for_adjustment) {
254 std::wstring result;
255 std::vector<size_t> original_offsets;
256 if (offsets_for_adjustment)
257 original_offsets = *offsets_for_adjustment;
258 std::string unescaped_url(
259 UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
260 if (UTF8ToWideAndAdjustOffsets(unescaped_url.data(), unescaped_url.length(),
261 &result, offsets_for_adjustment))
262 return WideToUTF16Hack(result); // Character set looks like it's valid.
263
264 // Not valid. Return the escaped version. Undo our changes to
265 // |offset_for_adjustment| since we haven't changed the string after all.
266 if (offsets_for_adjustment)
267 *offsets_for_adjustment = original_offsets;
268 return WideToUTF16Hack(UTF8ToWideAndAdjustOffsets(
269 text, offsets_for_adjustment));
270 }
271
UnescapeAndDecodeUTF8URLComponent(const std::string & text,UnescapeRule::Type rules,size_t * offset_for_adjustment)272 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
273 UnescapeRule::Type rules,
274 size_t* offset_for_adjustment) {
275 std::vector<size_t> offsets;
276 if (offset_for_adjustment)
277 offsets.push_back(*offset_for_adjustment);
278 string16 result =
279 UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
280 if (offset_for_adjustment)
281 *offset_for_adjustment = offsets[0];
282 return result;
283 }
284
UnescapeURLComponent(const std::string & escaped_text,UnescapeRule::Type rules)285 std::string UnescapeURLComponent(const std::string& escaped_text,
286 UnescapeRule::Type rules) {
287 return UnescapeURLWithOffsetsImpl<std::string>(escaped_text, rules, NULL);
288 }
289
// Unescapes the UTF-16 string |escaped_text| according to |rules|. No offsets
// are tracked.
string16 UnescapeURLComponent(const string16& escaped_text,
                              UnescapeRule::Type rules) {
  std::vector<size_t>* const kNoOffsets = NULL;
  return UnescapeURLWithOffsetsImpl<string16>(escaped_text, rules,
                                              kNoOffsets);
}
294
295
// Appends |c| to |output|, HTML-escaped: the characters < > & " and ' are
// appended as the character references &lt; &gt; &amp; &quot; and &#39;
// respectively; any other character is appended unchanged.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    // Numeric reference: "&apos;" is not a named entity in HTML 4.
    { '\'', "&#39;" },
  };
  const size_t kNumCharsToEscape =
      sizeof(kCharsToEscape) / sizeof(kCharsToEscape[0]);

  for (size_t k = 0; k < kNumCharsToEscape; ++k) {
    if (c == kCharsToEscape[k].key) {
      // The replacements are plain ASCII, so they can be appended one
      // character at a time to either a narrow or a wide string.
      for (const char* p = kCharsToEscape[k].replacement; *p; ++p)
        output->push_back(*p);
      return;
    }
  }
  // Not a special character.
  output->push_back(c);
}
320
AppendEscapedCharForHTML(char c,std::string * output)321 void AppendEscapedCharForHTML(char c, std::string* output) {
322 AppendEscapedCharForHTMLImpl(c, output);
323 }
324
AppendEscapedCharForHTML(wchar_t c,string16 * output)325 void AppendEscapedCharForHTML(wchar_t c, string16* output) {
326 AppendEscapedCharForHTMLImpl(c, output);
327 }
328
// Returns a copy of |input| with every character passed through
// AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str escaped;
  // Sized for the common case in which nothing needs escaping.
  escaped.reserve(input.size());

  typename str::const_iterator cursor = input.begin();
  while (cursor != input.end()) {
    AppendEscapedCharForHTMLImpl(*cursor, &escaped);
    ++cursor;
  }

  return escaped;
}
339
EscapeForHTML(const std::string & input)340 std::string EscapeForHTML(const std::string& input) {
341 return EscapeForHTMLImpl(input);
342 }
343
// Returns an HTML-escaped copy of the UTF-16 string |input|.
string16 EscapeForHTML(const string16& input) {
  return EscapeForHTMLImpl<string16>(input);
}
347
// Returns a copy of |input| in which the character references &lt; &gt;
// &amp; &quot; and &#39; are replaced by the characters < > & " and '
// respectively. Anything else, including an '&' that does not start one of
// those references, is copied through unchanged. Each reference is decoded
// once: the output of a replacement is never rescanned, so "&amp;lt;" yields
// "&lt;".
string16 UnescapeForHTML(const string16& input) {
  static const struct {
    const wchar_t* ampersand_code;
    const char replacement;
  } kEscapeToChars[] = {
    { L"&lt;", '<' },
    { L"&gt;", '>' },
    { L"&amp;", '&' },
    { L"&quot;", '"' },
    { L"&#39;", '\''},
  };

  // Fast path: with no ampersand there is nothing to decode.
  if (input.find(WideToUTF16(L"&")) == string16::npos)
    return input;

  // UTF-16 versions of the reference strings, converted once per call.
  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); ++i)
    ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);

  // Build the result in a single forward pass rather than editing a copy in
  // place; this avoids touching iterators after the string has been modified.
  string16 text;
  text.reserve(input.length());
  size_t index = 0;
  while (index < input.length()) {
    string16::value_type decoded = input[index];
    size_t consumed = 1;
    if (input[index] == '&') {
      // Potential ampersand encode char.
      for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); ++i) {
        const string16& code = ampersand_chars[i];
        // compare() only looks at the characters at |index|, whereas a
        // find() that misses here would scan the whole rest of the string.
        if (input.compare(index, code.length(), code) == 0) {
          decoded = kEscapeToChars[i].replacement;
          consumed = code.length();
          break;
        }
      }
    }
    text.push_back(decoded);
    index += consumed;
  }
  return text;
}
382
AdjustEncodingOffset(const Adjustments & adjustments)383 AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
384 : adjustments(adjustments) {}
385
operator ()(size_t & offset)386 void AdjustEncodingOffset::operator()(size_t& offset) {
387 // For each encoded character occurring before an offset subtract 2.
388 if (offset == string16::npos)
389 return;
390 size_t adjusted_offset = offset;
391 for (Adjustments::const_iterator i = adjustments.begin();
392 i != adjustments.end(); ++i) {
393 size_t location = *i;
394 if (offset <= location) {
395 offset = adjusted_offset;
396 return;
397 }
398 if (offset <= (location + 2)) {
399 offset = string16::npos;
400 return;
401 }
402 adjusted_offset -= 2;
403 }
404 offset = adjusted_offset;
405 }
406