1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/escape.h"
6
7 #include <algorithm>
8
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_offset_string_conversions.h"
14 #include "base/strings/utf_string_conversions.h"
15
16 namespace net {
17
18 namespace {
19
20 const char kHexString[] = "0123456789ABCDEF";
IntToHex(int i)21 inline char IntToHex(int i) {
22 DCHECK_GE(i, 0) << i << " not a hex value";
23 DCHECK_LE(i, 15) << i << " not a hex value";
24 return kHexString[i];
25 }
26
27 // A fast bit-vector map for ascii characters.
28 //
29 // Internally stores 256 bits in an array of 8 ints.
30 // Does quick bit-flicking to lookup needed characters.
31 struct Charmap {
Containsnet::__anon70938e3a0111::Charmap32 bool Contains(unsigned char c) const {
33 return ((map[c >> 5] & (1 << (c & 31))) != 0);
34 }
35
36 uint32 map[8];
37 };
38
39 // Given text to escape and a Charmap defining which values to escape,
40 // return an escaped string. If use_plus is true, spaces are converted
41 // to +, otherwise, if spaces are in the charmap, they are converted to
42 // %20.
Escape(const std::string & text,const Charmap & charmap,bool use_plus)43 std::string Escape(const std::string& text, const Charmap& charmap,
44 bool use_plus) {
45 std::string escaped;
46 escaped.reserve(text.length() * 3);
47 for (unsigned int i = 0; i < text.length(); ++i) {
48 unsigned char c = static_cast<unsigned char>(text[i]);
49 if (use_plus && ' ' == c) {
50 escaped.push_back('+');
51 } else if (charmap.Contains(c)) {
52 escaped.push_back('%');
53 escaped.push_back(IntToHex(c >> 4));
54 escaped.push_back(IntToHex(c & 0xf));
55 } else {
56 escaped.push_back(c);
57 }
58 }
59 return escaped;
60 }
61
// Contains nonzero when the corresponding character is unescapable for normal
// URLs. These characters are the ones that may change the parsing of a URL, so
// we don't want to unescape them sometimes. In many cases we won't want to
// unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc. Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
//
// The table is indexed by the 7-bit character value (0-127); a 1 means the
// %XX form of that character is unescaped regardless of the rule flags.
const char kUrlUnescape[128] = {
  // NULL, control chars...
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // ' ' ! " # $ % & ' ( ) * + , - . /
  0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  // @ A B C D E F G H I J K L M N O
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // P Q R S T U V W X Y Z [ \ ] ^ _
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
  // ` a b c d e f g h i j k l m n o
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  // p q r s t u v w x y z { | } ~ <DEL>
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
};
99
100 template<typename STR>
UnescapeURLWithOffsetsImpl(const STR & escaped_text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)101 STR UnescapeURLWithOffsetsImpl(const STR& escaped_text,
102 UnescapeRule::Type rules,
103 std::vector<size_t>* offsets_for_adjustment) {
104 if (offsets_for_adjustment) {
105 std::for_each(offsets_for_adjustment->begin(),
106 offsets_for_adjustment->end(),
107 base::LimitOffset<STR>(escaped_text.length()));
108 }
109 // Do not unescape anything, return the |escaped_text| text.
110 if (rules == UnescapeRule::NONE)
111 return escaped_text;
112
113 // The output of the unescaping is always smaller than the input, so we can
114 // reserve the input size to make sure we have enough buffer and don't have
115 // to allocate in the loop below.
116 STR result;
117 result.reserve(escaped_text.length());
118
119 // Locations of adjusted text.
120 net::internal::AdjustEncodingOffset::Adjustments adjustments;
121 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
122 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
123 // Non ASCII character, append as is.
124 result.push_back(escaped_text[i]);
125 continue;
126 }
127
128 char current_char = static_cast<char>(escaped_text[i]);
129 if (current_char == '%' && i + 2 < max) {
130 const typename STR::value_type most_sig_digit(
131 static_cast<typename STR::value_type>(escaped_text[i + 1]));
132 const typename STR::value_type least_sig_digit(
133 static_cast<typename STR::value_type>(escaped_text[i + 2]));
134 if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
135 unsigned char value = HexDigitToInt(most_sig_digit) * 16 +
136 HexDigitToInt(least_sig_digit);
137 if (value >= 0x80 || // Unescape all high-bit characters.
138 // For 7-bit characters, the lookup table tells us all valid chars.
139 (kUrlUnescape[value] ||
140 // ...and we allow some additional unescaping when flags are set.
141 (value == ' ' && (rules & UnescapeRule::SPACES)) ||
142 // Allow any of the prohibited but non-control characters when
143 // we're doing "special" chars.
144 (value > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
145 // Additionally allow control characters if requested.
146 (value < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
147 // Use the unescaped version of the character.
148 adjustments.push_back(i);
149 result.push_back(value);
150 i += 2;
151 } else {
152 // Keep escaped. Append a percent and we'll get the following two
153 // digits on the next loops through.
154 result.push_back('%');
155 }
156 } else {
157 // Invalid escape sequence, just pass the percent through and continue
158 // right after it.
159 result.push_back('%');
160 }
161 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
162 escaped_text[i] == '+') {
163 result.push_back(' ');
164 } else {
165 // Normal case for unescaped characters.
166 result.push_back(escaped_text[i]);
167 }
168 }
169
170 // Make offset adjustment.
171 if (offsets_for_adjustment && !adjustments.empty()) {
172 std::for_each(offsets_for_adjustment->begin(),
173 offsets_for_adjustment->end(),
174 net::internal::AdjustEncodingOffset(adjustments));
175 }
176
177 return result;
178 }
179
// Appends |c| to |output|, replacing the five HTML-significant characters
// (< > & " ') with their entity encodings. Any other character is appended
// unchanged. |str| is the string type being built (narrow or wide); the ASCII
// replacement text is appended one character at a time so it works for both.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  const char* replacement = NULL;
  switch (c) {
    case '<':
      replacement = "&lt;";
      break;
    case '>':
      replacement = "&gt;";
      break;
    case '&':
      replacement = "&amp;";
      break;
    case '"':
      replacement = "&quot;";
      break;
    case '\'':
      replacement = "&#39;";
      break;
    default:
      break;
  }

  if (!replacement) {
    // Not a special character: pass it through untouched.
    output->push_back(c);
    return;
  }
  for (const char* p = replacement; *p; ++p)
    output->push_back(*p);
}
204
// Builds and returns a copy of |input| in which every character has been run
// through AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str escaped;
  // Sized for the common case where nothing needs escaping.
  escaped.reserve(input.size());

  for (size_t pos = 0; pos < input.size(); ++pos)
    AppendEscapedCharForHTMLImpl(input[pos], &escaped);

  return escaped;
}
215
// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
//
// Each word covers 32 consecutive byte values (word 0 is 0x00-0x1f, word 1 is
// 0x20-0x3f, and so on); a set bit means the byte is escaped.
static const Charmap kQueryCharmap = {{
  0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
222
// non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
//
// Each word covers 32 consecutive byte values; a set bit means the byte is
// escaped.
static const Charmap kPathCharmap = {{
  0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
228
// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
//
// Each word covers 32 consecutive byte values; a set bit means the byte is
// escaped.
// NOTE(review): as encoded, the bit for '!' (bit 1 of the second word) is
// clear and the bit for '@' (bit 0 of the third word) is set, so '!' is left
// alone and '@' is escaped. That does not match the character list above;
// confirm which one is intended.
static const Charmap kUrlEscape = {{
  0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
234
// non-7bit
//
// Only the bits for byte values 0x80-0xff (the last four words) are set, so
// every 7-bit character passes through unescaped.
static const Charmap kNonASCIICharmap = {{
  0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
240
// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~%
//
// Each word covers 32 consecutive byte values; a set bit means the byte is
// escaped.
// NOTE(review): as encoded, the bit for '+' (bit 11 of the second word) is
// set and the bit for '\' (bit 28 of the third word) is clear, so '+' is
// escaped and '\' is not. That does not match the description above; confirm
// which one is intended.
static const Charmap kExternalHandlerCharmap = {{
  0xffffffffL, 0x5000080dL, 0x68000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
247
248 } // namespace
249
EscapeQueryParamValue(const std::string & text,bool use_plus)250 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
251 return Escape(text, kQueryCharmap, use_plus);
252 }
253
EscapePath(const std::string & path)254 std::string EscapePath(const std::string& path) {
255 return Escape(path, kPathCharmap, false);
256 }
257
EscapeUrlEncodedData(const std::string & path,bool use_plus)258 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
259 return Escape(path, kUrlEscape, use_plus);
260 }
261
EscapeNonASCII(const std::string & input)262 std::string EscapeNonASCII(const std::string& input) {
263 return Escape(input, kNonASCIICharmap, false);
264 }
265
EscapeExternalHandlerValue(const std::string & text)266 std::string EscapeExternalHandlerValue(const std::string& text) {
267 return Escape(text, kExternalHandlerCharmap, false);
268 }
269
AppendEscapedCharForHTML(char c,std::string * output)270 void AppendEscapedCharForHTML(char c, std::string* output) {
271 AppendEscapedCharForHTMLImpl(c, output);
272 }
273
EscapeForHTML(const std::string & input)274 std::string EscapeForHTML(const std::string& input) {
275 return EscapeForHTMLImpl(input);
276 }
277
// Returns an HTML-escaped copy of the UTF-16 string |input|.
base::string16 EscapeForHTML(const base::string16& input) {
  return EscapeForHTMLImpl<base::string16>(input);
}
281
UnescapeURLComponent(const std::string & escaped_text,UnescapeRule::Type rules)282 std::string UnescapeURLComponent(const std::string& escaped_text,
283 UnescapeRule::Type rules) {
284 return UnescapeURLWithOffsetsImpl(escaped_text, rules, NULL);
285 }
286
// Unescapes the UTF-16 string |escaped_text| according to |rules|. No offsets
// are tracked.
base::string16 UnescapeURLComponent(const base::string16& escaped_text,
                                    UnescapeRule::Type rules) {
  return UnescapeURLWithOffsetsImpl<base::string16>(escaped_text, rules, NULL);
}
291
UnescapeAndDecodeUTF8URLComponent(const std::string & text,UnescapeRule::Type rules,size_t * offset_for_adjustment)292 base::string16 UnescapeAndDecodeUTF8URLComponent(
293 const std::string& text,
294 UnescapeRule::Type rules,
295 size_t* offset_for_adjustment) {
296 std::vector<size_t> offsets;
297 if (offset_for_adjustment)
298 offsets.push_back(*offset_for_adjustment);
299 base::string16 result =
300 UnescapeAndDecodeUTF8URLComponentWithOffsets(text, rules, &offsets);
301 if (offset_for_adjustment)
302 *offset_for_adjustment = offsets[0];
303 return result;
304 }
305
UnescapeAndDecodeUTF8URLComponentWithOffsets(const std::string & text,UnescapeRule::Type rules,std::vector<size_t> * offsets_for_adjustment)306 base::string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
307 const std::string& text,
308 UnescapeRule::Type rules,
309 std::vector<size_t>* offsets_for_adjustment) {
310 base::string16 result;
311 std::vector<size_t> original_offsets;
312 if (offsets_for_adjustment)
313 original_offsets = *offsets_for_adjustment;
314 std::string unescaped_url(
315 UnescapeURLWithOffsetsImpl(text, rules, offsets_for_adjustment));
316 if (base::UTF8ToUTF16AndAdjustOffsets(unescaped_url.data(),
317 unescaped_url.length(),
318 &result, offsets_for_adjustment))
319 return result; // Character set looks like it's valid.
320
321 // Not valid. Return the escaped version. Undo our changes to
322 // |offset_for_adjustment| since we haven't changed the string after all.
323 if (offsets_for_adjustment)
324 *offsets_for_adjustment = original_offsets;
325 return base::UTF8ToUTF16AndAdjustOffsets(text, offsets_for_adjustment);
326 }
327
// Returns a copy of |input| with the five HTML entities produced by
// EscapeForHTML() (&lt; &gt; &amp; &quot; &#39;) replaced by the characters
// they stand for. The text is scanned once from left to right and a replaced
// character is never re-examined, so "&amp;lt;" becomes "&lt;", not "<".
base::string16 UnescapeForHTML(const base::string16& input) {
  static const struct {
    const char* ampersand_code;
    const char replacement;
  } kEscapeToChars[] = {
    { "&lt;", '<' },
    { "&gt;", '>' },
    { "&amp;", '&' },
    { "&quot;", '"' },
    { "&#39;", '\'' },
  };

  // Fast path: nothing to do without an ampersand.
  if (input.find(ASCIIToUTF16("&")) == std::string::npos)
    return input;

  // UTF-16 versions of the entity names, converted lazily on first use.
  base::string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];
  base::string16 text(input);
  // Walk by index rather than by iterator: replace() below may invalidate
  // iterators into |text|, while an index stays valid. |text| shrinks as
  // entities are replaced, so the bound is re-read on every iteration.
  for (size_t index = 0; index < text.size(); ++index) {
    if (text[index] != '&')
      continue;
    // Potential ampersand encode char.
    for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
      if (ampersand_chars[i].empty())
        ampersand_chars[i] = ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
      const base::string16& code = ampersand_chars[i];
      // compare() looks only at the characters starting at |index|, instead
      // of searching the remainder of the string for a match.
      if (text.compare(index, code.length(), code) == 0) {
        text.replace(
            index, code.length(), static_cast<base::string16::size_type>(1),
            static_cast<base::string16::value_type>(
                kEscapeToChars[i].replacement));
        break;
      }
    }
  }
  return text;
}
363
364 namespace internal {
365
// Stores |adjustments| in the member of the same name for later use by
// operator(). UnescapeURLWithOffsetsImpl() passes the input positions at
// which an escape sequence was replaced.
AdjustEncodingOffset::AdjustEncodingOffset(const Adjustments& adjustments)
    : adjustments(adjustments) {}
368
operator ()(size_t & offset)369 void AdjustEncodingOffset::operator()(size_t& offset) {
370 // For each encoded character occurring before an offset subtract 2.
371 if (offset == base::string16::npos)
372 return;
373 size_t adjusted_offset = offset;
374 for (Adjustments::const_iterator i = adjustments.begin();
375 i != adjustments.end(); ++i) {
376 size_t location = *i;
377 if (offset <= location) {
378 offset = adjusted_offset;
379 return;
380 }
381 if (offset <= (location + 2)) {
382 offset = base::string16::npos;
383 return;
384 }
385 adjusted_offset -= 2;
386 }
387 offset = adjusted_offset;
388 }
389
390 } // namespace internal
391
392 } // namespace net
393