• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef NET_BASE_ESCAPE_H_
6 #define NET_BASE_ESCAPE_H_
7 #pragma once
8 
9 #include <string>
10 #include <vector>
11 
12 #include "base/basictypes.h"
13 #include "base/string16.h"
14 
15 // Escaping --------------------------------------------------------------------
16 
17 // Escapes a file path and returns the result.  The escaped characters are:
18 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
19 std::string EscapePath(const std::string& path);
20 
21 // Escapes application/x-www-form-urlencoded content.  This includes:
22 // non-printable, non-7bit, and (including space)  ?>=<;+'&%$#"![\]^`{|}
23 // Space is escaped as + and other special characters as %XX (hex).
// Note: the parameter is named |path|, but per the description above it
// carries the form content to escape.
24 std::string EscapeUrlEncodedData(const std::string& path);
25 
26 // Escapes everything in |input| that is not ASCII and returns the result.
27 std::string EscapeNonASCII(const std::string& input);
28 
29 // Escapes characters in text suitable for use as an external protocol handler
30 // command.
31 // We %XX everything except alphanumerics and %-_.!~*'() and the restricted
32 // characters (;/?:@&=+$,).
33 std::string EscapeExternalHandlerValue(const std::string& text);
34 
35 // Appends the character |c| to the string |output|, escaping the character if
36 // the character would be interpreted as an HTML delimiter.
37 void AppendEscapedCharForHTML(char c, std::string* output);
38 
39 // Escapes chars that might cause |text| to be interpreted as HTML tags.
// Overloads are provided for std::string and string16; each returns the
// escaped text in the same string type it was given.
40 std::string EscapeForHTML(const std::string& text);
41 string16 EscapeForHTML(const string16& text);
42 
43 // Unescaping ------------------------------------------------------------------
44 
// Groups the flag constants, and the integer type used to carry a combination
// of them, that tell the unescaping functions which escapes to convert.
45 class UnescapeRule {
46  public:
47   // A combination of the following flags that is passed to the unescaping
48   // functions.
49   typedef uint32 Type;
50 
51   enum {
52     // Don't unescape anything at all.
53     NONE = 0,
54 
55     // Don't unescape anything special, but all normal unescaping will happen.
56     // This is a placeholder and can't be combined with other flags (since it's
57     // just the absence of them). All other unescape rules imply "normal" in
58     // addition to their special meaning. Things like escaped letters, digits,
59     // and most symbols will get unescaped with this mode.
       // NOTE(review): although described as "the absence of" flags, NORMAL is
       // declared as 1 (NONE is the zero value), so it does occupy a bit of its
       // own. Confirm against the implementation how that bit is treated.
60     NORMAL = 1,
61 
62     // Convert %20 to spaces. In some places where we're showing URLs, we may
63     // want this. In places where the URL may be copied and pasted out, then
64     // you wouldn't want this since it might not be interpreted in one piece
65     // by other applications.
66     SPACES = 2,
67 
68     // Unescapes various characters that will change the meaning of URLs,
69     // including '%', '+', '&', '/', '#'. If we unescaped these characters, the
70     // resulting URL won't be the same as the source one. This flag is used when
71     // generating final output like filenames for URLs where we won't be
72     // interpreting as a URL and want to do as much unescaping as possible.
73     URL_SPECIAL_CHARS = 4,
74 
75     // Unescapes control characters such as %01. This INCLUDES NULLs. This is
76     // used for rare cases such as data: URL decoding where the result is binary
77     // data. You should not use this for normal URLs!
78     CONTROL_CHARS = 8,
79 
80     // URL queries use "+" for space. This flag controls that replacement.
81     REPLACE_PLUS_WITH_SPACE = 16,
82   };
83 };
84 
85 // Unescapes |escaped_text| and returns the result.
86 // Unescaping consists of looking for the exact pattern "%XX", where each X is
87 // a hex digit, and converting to the character with the numerical value of
88 // those digits. Thus "i%20=%203%3b" unescapes to "i = 3;".
// |rules| is a combination of UnescapeRule flags selecting which escapes are
// converted. (The %20 conversions in the example above presumably require
// UnescapeRule::SPACES; confirm against the implementation.)
89 //
90 // Watch out: this doesn't necessarily result in the correct final result,
91 // because the encoding may be unknown. For example, the input might be ASCII,
92 // which, after unescaping, is supposed to be interpreted as UTF-8, and then
93 // converted into full wide chars. This function won't tell you if any
94 // conversions need to take place, it only unescapes.
95 std::string UnescapeURLComponent(const std::string& escaped_text,
96                                  UnescapeRule::Type rules);
97 string16 UnescapeURLComponent(const string16& escaped_text,
98                               UnescapeRule::Type rules);
99 
100 // Unescapes the given substring as a URL, and then tries to interpret the
101 // result as being encoded as UTF-8. If the result is convertible into UTF-8, it
102 // will be returned as converted. If it is not, the original escaped string will
103 // be converted into a string16 and returned. |offset[s]_for_adjustment|
104 // specifies one or more offsets into the source strings; each offset will be
105 // adjusted to point at the same logical place in the result strings during
106 // decoding.  If this isn't possible because an offset points past the end of
107 // the source strings or into the middle of a multibyte sequence, the offending
108 // offset will be set to std::wstring::npos. |offset[s]_for_adjustment| may be
109 // NULL.
110 string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
111                                            UnescapeRule::Type rules,
112                                            size_t* offset_for_adjustment);
113 string16 UnescapeAndDecodeUTF8URLComponentWithOffsets(
114     const std::string& text,
115     UnescapeRule::Type rules,
116     std::vector<size_t>* offsets_for_adjustment);
117 
118 // Unescapes the following ampersand character codes in |text|:
119 // &lt; &gt; &amp; &quot; &#39;
120 string16 UnescapeForHTML(const string16& text);
121 
122 // Deprecated ------------------------------------------------------------------
123 
124 // Escapes characters in text suitable for use as a query parameter value.
125 // We %XX everything except alphanumerics and -_.!~*'()
126 // Spaces change to "+" unless you pass use_plus=false.
127 // This is basically the same as encodeURIComponent in JavaScript.
128 // For the string16 version, we convert to the |codepage| charset before
129 // encoding the string.  If the charset doesn't exist, we return false.
// (Presumably the escaped text is written to |escaped| on success; confirm
// against the implementation.)
130 std::string EscapeQueryParamValue(const std::string& text, bool use_plus);
131 bool EscapeQueryParamValue(const string16& text, const char* codepage,
132                            bool use_plus, string16* escaped);
133 
134 // A specialized version of EscapeQueryParamValue for string16s that
135 // assumes the codepage is UTF8 and returns the escaped text directly.
// This is provided as a convenience.
136 string16 EscapeQueryParamValueUTF8(const string16& text, bool use_plus);
137 
138 // Private Functions (Exposed for Unit Testing) --------------------------------
139 
140 // A function object meant to be passed to std::for_each that will adjust any
141 // offset which occurs after one or more encoded characters.
142 struct AdjustEncodingOffset {
      // Container type for the adjustment positions this functor consults.
143   typedef std::vector<size_t> Adjustments;
144 
145   explicit AdjustEncodingOffset(const Adjustments& adjustments);
      // Takes |offset| by non-const reference so it can be adjusted in place.
146   void operator()(size_t& offset);
147 
      // NOTE(review): stored as a reference, not a copy. Presumably bound to
      // the constructor argument, in which case that vector must outlive this
      // object; confirm in the implementation file.
148   const Adjustments& adjustments;
149 };
150 
151 #endif  // NET_BASE_ESCAPE_H_
152