1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/http/http_content_disposition.h"
6
7 #include "base/base64.h"
8 #include "base/check_op.h"
9 #include "base/strings/escape.h"
10 #include "base/strings/string_piece.h"
11 #include "base/strings/string_tokenizer.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/sys_string_conversions.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "net/base/net_string_util.h"
16 #include "net/http/http_util.h"
17
18 namespace net {
19
20 namespace {
21
// The two payload encodings an RFC 2047 encoded-word can declare in its
// encoding field.
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q": decoded by DecodeQEncoding() (RFC 2047 section 4.2).
  B_ENCODING,  // "B": decoded as base64.
};
26
27 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
28 // decoding a quoted-printable string. Returns true if the input was valid.
DecodeQEncoding(base::StringPiece input,std::string * output)29 bool DecodeQEncoding(base::StringPiece input, std::string* output) {
30 std::string temp;
31 temp.reserve(input.size());
32 for (auto* it = input.begin(); it != input.end(); ++it) {
33 if (*it == '_') {
34 temp.push_back(' ');
35 } else if (*it == '=') {
36 if ((input.end() - it < 3) ||
37 !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
38 !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
39 return false;
40 unsigned char ch =
41 base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
42 temp.push_back(static_cast<char>(ch));
43 ++it;
44 ++it;
45 } else if (0x20 < *it && *it < 0x7F && *it != '?') {
46 // In a Q-encoded word, only printable ASCII characters
47 // represent themselves. Besides, space, '=', '_' and '?' are
48 // not allowed, but they're already filtered out.
49 DCHECK_NE('=', *it);
50 DCHECK_NE('?', *it);
51 DCHECK_NE('_', *it);
52 temp.push_back(*it);
53 } else {
54 return false;
55 }
56 }
57 output->swap(temp);
58 return true;
59 }
60
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
62 // type is specified in |enc_type|.
DecodeBQEncoding(base::StringPiece part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)63 bool DecodeBQEncoding(base::StringPiece part,
64 RFC2047EncodingType enc_type,
65 const std::string& charset,
66 std::string* output) {
67 std::string decoded;
68 if (!((enc_type == B_ENCODING) ?
69 base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
70 return false;
71 }
72
73 if (decoded.empty()) {
74 output->clear();
75 return true;
76 }
77
78 return ConvertToUtf8(decoded, charset.c_str(), output);
79 }
80
DecodeWord(base::StringPiece encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output,int * parse_result_flags)81 bool DecodeWord(base::StringPiece encoded_word,
82 const std::string& referrer_charset,
83 bool* is_rfc2047,
84 std::string* output,
85 int* parse_result_flags) {
86 *is_rfc2047 = false;
87 output->clear();
88 if (encoded_word.empty())
89 return true;
90
91 if (!base::IsStringASCII(encoded_word)) {
92 // Try UTF-8, referrer_charset and the native OS default charset in turn.
93 if (base::IsStringUTF8(encoded_word)) {
94 *output = std::string(encoded_word);
95 } else {
96 std::u16string utf16_output;
97 if (!referrer_charset.empty() &&
98 ConvertToUTF16(encoded_word, referrer_charset.c_str(),
99 &utf16_output)) {
100 *output = base::UTF16ToUTF8(utf16_output);
101 } else {
102 *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
103 }
104 }
105
106 *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
107 return true;
108 }
109
110 // RFC 2047 : one of encoding methods supported by Firefox and relatively
111 // widely used by web servers.
112 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
113 // We don't care about the length restriction (72 bytes) because
114 // many web servers generate encoded words longer than the limit.
115 std::string decoded_word;
116 *is_rfc2047 = true;
117 int part_index = 0;
118 std::string charset;
119 base::CStringTokenizer t(encoded_word.data(),
120 encoded_word.data() + encoded_word.size(), "?");
121 RFC2047EncodingType enc_type = Q_ENCODING;
122 while (*is_rfc2047 && t.GetNext()) {
123 base::StringPiece part = t.token_piece();
124 switch (part_index) {
125 case 0:
126 if (part != "=") {
127 *is_rfc2047 = false;
128 break;
129 }
130 ++part_index;
131 break;
132 case 1:
133 // Do we need charset validity check here?
134 charset = std::string(part);
135 ++part_index;
136 break;
137 case 2:
138 if (part.size() > 1 ||
139 part.find_first_of("bBqQ") == std::string::npos) {
140 *is_rfc2047 = false;
141 break;
142 }
143 if (part[0] == 'b' || part[0] == 'B') {
144 enc_type = B_ENCODING;
145 }
146 ++part_index;
147 break;
148 case 3:
149 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
150 if (!*is_rfc2047) {
151 // Last minute failure. Invalid B/Q encoding. Rather than
152 // passing it through, return now.
153 return false;
154 }
155 ++part_index;
156 break;
157 case 4:
158 if (part != "=") {
159 // Another last minute failure !
160 // Likely to be a case of two encoded-words in a row or
161 // an encoded word followed by a non-encoded word. We can be
162 // generous, but it does not help much in terms of compatibility,
163 // I believe. Return immediately.
164 *is_rfc2047 = false;
165 return false;
166 }
167 ++part_index;
168 break;
169 default:
170 *is_rfc2047 = false;
171 return false;
172 }
173 }
174
175 if (*is_rfc2047) {
176 if (*(encoded_word.end() - 1) == '=') {
177 output->swap(decoded_word);
178 *parse_result_flags |=
179 HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
180 return true;
181 }
182 // encoded_word ending prematurelly with '?' or extra '?'
183 *is_rfc2047 = false;
184 return false;
185 }
186
187 // We're not handling 'especial' characters quoted with '\', but
188 // it should be Ok because we're not an email client but a
189 // web browser.
190
191 // What IE6/7 does: %-escaped UTF-8.
192 decoded_word = base::UnescapeBinaryURLComponent(encoded_word,
193 base::UnescapeRule::NORMAL);
194 if (decoded_word != encoded_word)
195 *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
196 if (base::IsStringUTF8(decoded_word)) {
197 output->swap(decoded_word);
198 return true;
199 // We can try either the OS default charset or 'origin charset' here,
200 // As far as I can tell, IE does not support it. However, I've seen
201 // web servers emit %-escaped string in a legacy encoding (usually
202 // origin charset).
203 // TODO(jungshik) : Test IE further and consider adding a fallback here.
204 }
205 return false;
206 }
207
208 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
209 // value is supposed to be of the form:
210 //
211 // value = token | quoted-string
212 //
213 // However we currently also allow RFC 2047 encoding and non-ASCII
214 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
DecodeFilenameValue(const std::string & input,const std::string & referrer_charset,std::string * output,int * parse_result_flags)215 bool DecodeFilenameValue(const std::string& input,
216 const std::string& referrer_charset,
217 std::string* output,
218 int* parse_result_flags) {
219 int current_parse_result_flags = 0;
220 std::string decoded_value;
221 bool is_previous_token_rfc2047 = true;
222
223 // Tokenize with whitespace characters.
224 base::StringTokenizer t(input, " \t\n\r");
225 t.set_options(base::StringTokenizer::RETURN_DELIMS);
226 while (t.GetNext()) {
227 if (t.token_is_delim()) {
228 // If the previous non-delimeter token is not RFC2047-encoded,
229 // put in a space in its place. Otheriwse, skip over it.
230 if (!is_previous_token_rfc2047)
231 decoded_value.push_back(' ');
232 continue;
233 }
234 // We don't support a single multibyte character split into
235 // adjacent encoded words. Some broken mail clients emit headers
236 // with that problem, but most web servers usually encode a filename
237 // in a single encoded-word. Firefox/Thunderbird do not support
238 // it, either.
239 std::string decoded;
240 if (!DecodeWord(t.token_piece(), referrer_charset,
241 &is_previous_token_rfc2047, &decoded,
242 ¤t_parse_result_flags))
243 return false;
244 decoded_value.append(decoded);
245 }
246 output->swap(decoded_value);
247 if (parse_result_flags && !output->empty())
248 *parse_result_flags |= current_parse_result_flags;
249 return true;
250 }
251
252 // Parses the charset and value-chars out of an ext-value string.
253 //
254 // ext-value = charset "'" [ language ] "'" value-chars
ParseExtValueComponents(const std::string & input,std::string * charset,std::string * value_chars)255 bool ParseExtValueComponents(const std::string& input,
256 std::string* charset,
257 std::string* value_chars) {
258 base::StringTokenizer t(input, "'");
259 t.set_options(base::StringTokenizer::RETURN_DELIMS);
260 base::StringPiece temp_charset;
261 base::StringPiece temp_value;
262 int num_delims_seen = 0;
263 while (t.GetNext()) {
264 if (t.token_is_delim()) {
265 ++num_delims_seen;
266 continue;
267 } else {
268 switch (num_delims_seen) {
269 case 0:
270 temp_charset = t.token_piece();
271 break;
272 case 1:
273 // Language is ignored.
274 break;
275 case 2:
276 temp_value = t.token_piece();
277 break;
278 default:
279 return false;
280 }
281 }
282 }
283 if (num_delims_seen != 2)
284 return false;
285 if (temp_charset.empty() || temp_value.empty())
286 return false;
287 *charset = std::string(temp_charset);
288 *value_chars = std::string(temp_value);
289 return true;
290 }
291
292 // http://tools.ietf.org/html/rfc5987#section-3.2
293 //
294 // ext-value = charset "'" [ language ] "'" value-chars
295 //
296 // charset = "UTF-8" / "ISO-8859-1" / mime-charset
297 //
298 // mime-charset = 1*mime-charsetc
299 // mime-charsetc = ALPHA / DIGIT
300 // / "!" / "#" / "$" / "%" / "&"
301 // / "+" / "-" / "^" / "_" / "`"
302 // / "{" / "}" / "~"
303 //
304 // language = <Language-Tag, defined in [RFC5646], Section 2.1>
305 //
306 // value-chars = *( pct-encoded / attr-char )
307 //
308 // pct-encoded = "%" HEXDIG HEXDIG
309 //
310 // attr-char = ALPHA / DIGIT
311 // / "!" / "#" / "$" / "&" / "+" / "-" / "."
312 // / "^" / "_" / "`" / "|" / "~"
DecodeExtValue(const std::string & param_value,std::string * decoded)313 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
314 if (param_value.find('"') != std::string::npos)
315 return false;
316
317 std::string charset;
318 std::string value;
319 if (!ParseExtValueComponents(param_value, &charset, &value))
320 return false;
321
322 // RFC 5987 value should be ASCII-only.
323 if (!base::IsStringASCII(value)) {
324 decoded->clear();
325 return true;
326 }
327
328 std::string unescaped =
329 base::UnescapeBinaryURLComponent(value, base::UnescapeRule::NORMAL);
330
331 return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
332 }
333
334 } // namespace
335
HttpContentDisposition(const std::string & header,const std::string & referrer_charset)336 HttpContentDisposition::HttpContentDisposition(
337 const std::string& header,
338 const std::string& referrer_charset) {
339 Parse(header, referrer_charset);
340 }
341
342 HttpContentDisposition::~HttpContentDisposition() = default;
343
ConsumeDispositionType(std::string::const_iterator begin,std::string::const_iterator end)344 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
345 std::string::const_iterator begin, std::string::const_iterator end) {
346 DCHECK(type_ == INLINE);
347 auto header = base::MakeStringPiece(begin, end);
348 size_t delimiter = header.find(';');
349 base::StringPiece type = header.substr(0, delimiter);
350 type = HttpUtil::TrimLWS(type);
351
352 // If the disposition-type isn't a valid token the then the
353 // Content-Disposition header is malformed, and we treat the first bytes as
354 // a parameter rather than a disposition-type.
355 if (type.empty() || !HttpUtil::IsToken(type))
356 return begin;
357
358 parse_result_flags_ |= HAS_DISPOSITION_TYPE;
359
360 DCHECK(type.find('=') == base::StringPiece::npos);
361
362 if (base::EqualsCaseInsensitiveASCII(type, "inline")) {
363 type_ = INLINE;
364 } else if (base::EqualsCaseInsensitiveASCII(type, "attachment")) {
365 type_ = ATTACHMENT;
366 } else {
367 parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
368 type_ = ATTACHMENT;
369 }
370 return begin + (type.data() + type.size() - header.data());
371 }
372
373 // http://tools.ietf.org/html/rfc6266
374 //
375 // content-disposition = "Content-Disposition" ":"
376 // disposition-type *( ";" disposition-parm )
377 //
378 // disposition-type = "inline" | "attachment" | disp-ext-type
379 // ; case-insensitive
380 // disp-ext-type = token
381 //
382 // disposition-parm = filename-parm | disp-ext-parm
383 //
384 // filename-parm = "filename" "=" value
385 // | "filename*" "=" ext-value
386 //
387 // disp-ext-parm = token "=" value
388 // | ext-token "=" ext-value
389 // ext-token = <the characters in token, followed by "*">
390 //
Parse(const std::string & header,const std::string & referrer_charset)391 void HttpContentDisposition::Parse(const std::string& header,
392 const std::string& referrer_charset) {
393 DCHECK(type_ == INLINE);
394 DCHECK(filename_.empty());
395
396 std::string::const_iterator pos = header.begin();
397 std::string::const_iterator end = header.end();
398 pos = ConsumeDispositionType(pos, end);
399
400 std::string filename;
401 std::string ext_filename;
402
403 HttpUtil::NameValuePairsIterator iter(pos, end, ';');
404 while (iter.GetNext()) {
405 if (filename.empty() &&
406 base::EqualsCaseInsensitiveASCII(iter.name_piece(), "filename")) {
407 DecodeFilenameValue(iter.value(), referrer_charset, &filename,
408 &parse_result_flags_);
409 if (!filename.empty()) {
410 parse_result_flags_ |= HAS_FILENAME;
411 if (filename[0] == '\'')
412 parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
413 }
414 } else if (ext_filename.empty() && base::EqualsCaseInsensitiveASCII(
415 iter.name_piece(), "filename*")) {
416 DecodeExtValue(iter.raw_value(), &ext_filename);
417 if (!ext_filename.empty())
418 parse_result_flags_ |= HAS_EXT_FILENAME;
419 }
420 }
421
422 if (!ext_filename.empty())
423 filename_ = ext_filename;
424 else
425 filename_ = filename;
426
427 if (!filename.empty() && filename[0] == '\'')
428 parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
429 }
430
431 } // namespace net
432