// Copyright 2012 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "net/http/http_content_disposition.h"
6 
7 #include "base/base64.h"
8 #include "base/check_op.h"
9 #include "base/strings/escape.h"
10 #include "base/strings/string_piece.h"
11 #include "base/strings/string_tokenizer.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/sys_string_conversions.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "net/base/net_string_util.h"
16 #include "net/http/http_util.h"
17 
18 namespace net {
19 
20 namespace {
21 
// The two content encodings an RFC 2047 encoded-word can declare.
enum RFC2047EncodingType {
  Q_ENCODING = 0,  // "Q": similar to quoted-printable.
  B_ENCODING = 1,  // "B": base64.
};
26 
27 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
28 // decoding a quoted-printable string.  Returns true if the input was valid.
DecodeQEncoding(base::StringPiece input,std::string * output)29 bool DecodeQEncoding(base::StringPiece input, std::string* output) {
30   std::string temp;
31   temp.reserve(input.size());
32   for (auto* it = input.begin(); it != input.end(); ++it) {
33     if (*it == '_') {
34       temp.push_back(' ');
35     } else if (*it == '=') {
36       if ((input.end() - it < 3) ||
37           !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
38           !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
39         return false;
40       unsigned char ch =
41           base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
42       temp.push_back(static_cast<char>(ch));
43       ++it;
44       ++it;
45     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
46       // In a Q-encoded word, only printable ASCII characters
47       // represent themselves. Besides, space, '=', '_' and '?' are
48       // not allowed, but they're already filtered out.
49       DCHECK_NE('=', *it);
50       DCHECK_NE('?', *it);
51       DCHECK_NE('_', *it);
52       temp.push_back(*it);
53     } else {
54       return false;
55     }
56   }
57   output->swap(temp);
58   return true;
59 }
60 
61 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
62 // type is specified in |enc_type|.
DecodeBQEncoding(base::StringPiece part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)63 bool DecodeBQEncoding(base::StringPiece part,
64                       RFC2047EncodingType enc_type,
65                       const std::string& charset,
66                       std::string* output) {
67   std::string decoded;
68   if (!((enc_type == B_ENCODING) ?
69         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
70     return false;
71   }
72 
73   if (decoded.empty()) {
74     output->clear();
75     return true;
76   }
77 
78   return ConvertToUtf8(decoded, charset.c_str(), output);
79 }
80 
DecodeWord(base::StringPiece encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output,int * parse_result_flags)81 bool DecodeWord(base::StringPiece encoded_word,
82                 const std::string& referrer_charset,
83                 bool* is_rfc2047,
84                 std::string* output,
85                 int* parse_result_flags) {
86   *is_rfc2047 = false;
87   output->clear();
88   if (encoded_word.empty())
89     return true;
90 
91   if (!base::IsStringASCII(encoded_word)) {
92     // Try UTF-8, referrer_charset and the native OS default charset in turn.
93     if (base::IsStringUTF8(encoded_word)) {
94       *output = std::string(encoded_word);
95     } else {
96       std::u16string utf16_output;
97       if (!referrer_charset.empty() &&
98           ConvertToUTF16(encoded_word, referrer_charset.c_str(),
99                          &utf16_output)) {
100         *output = base::UTF16ToUTF8(utf16_output);
101       } else {
102         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
103       }
104     }
105 
106     *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
107     return true;
108   }
109 
110   // RFC 2047 : one of encoding methods supported by Firefox and relatively
111   // widely used by web servers.
112   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
113   // We don't care about the length restriction (72 bytes) because
114   // many web servers generate encoded words longer than the limit.
115   std::string decoded_word;
116   *is_rfc2047 = true;
117   int part_index = 0;
118   std::string charset;
119   base::CStringTokenizer t(encoded_word.data(),
120                            encoded_word.data() + encoded_word.size(), "?");
121   RFC2047EncodingType enc_type = Q_ENCODING;
122   while (*is_rfc2047 && t.GetNext()) {
123     base::StringPiece part = t.token_piece();
124     switch (part_index) {
125       case 0:
126         if (part != "=") {
127           *is_rfc2047 = false;
128           break;
129         }
130         ++part_index;
131         break;
132       case 1:
133         // Do we need charset validity check here?
134         charset = std::string(part);
135         ++part_index;
136         break;
137       case 2:
138         if (part.size() > 1 ||
139             part.find_first_of("bBqQ") == std::string::npos) {
140           *is_rfc2047 = false;
141           break;
142         }
143         if (part[0] == 'b' || part[0] == 'B') {
144           enc_type = B_ENCODING;
145         }
146         ++part_index;
147         break;
148       case 3:
149         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
150         if (!*is_rfc2047) {
151           // Last minute failure. Invalid B/Q encoding. Rather than
152           // passing it through, return now.
153           return false;
154         }
155         ++part_index;
156         break;
157       case 4:
158         if (part != "=") {
159           // Another last minute failure !
160           // Likely to be a case of two encoded-words in a row or
161           // an encoded word followed by a non-encoded word. We can be
162           // generous, but it does not help much in terms of compatibility,
163           // I believe. Return immediately.
164           *is_rfc2047 = false;
165           return false;
166         }
167         ++part_index;
168         break;
169       default:
170         *is_rfc2047 = false;
171         return false;
172     }
173   }
174 
175   if (*is_rfc2047) {
176     if (*(encoded_word.end() - 1) == '=') {
177       output->swap(decoded_word);
178       *parse_result_flags |=
179           HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
180       return true;
181     }
182     // encoded_word ending prematurelly with '?' or extra '?'
183     *is_rfc2047 = false;
184     return false;
185   }
186 
187   // We're not handling 'especial' characters quoted with '\', but
188   // it should be Ok because we're not an email client but a
189   // web browser.
190 
191   // What IE6/7 does: %-escaped UTF-8.
192   decoded_word = base::UnescapeBinaryURLComponent(encoded_word,
193                                                   base::UnescapeRule::NORMAL);
194   if (decoded_word != encoded_word)
195     *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
196   if (base::IsStringUTF8(decoded_word)) {
197     output->swap(decoded_word);
198     return true;
199     // We can try either the OS default charset or 'origin charset' here,
200     // As far as I can tell, IE does not support it. However, I've seen
201     // web servers emit %-escaped string in a legacy encoding (usually
202     // origin charset).
203     // TODO(jungshik) : Test IE further and consider adding a fallback here.
204   }
205   return false;
206 }
207 
208 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
209 // value is supposed to be of the form:
210 //
211 //   value                   = token | quoted-string
212 //
213 // However we currently also allow RFC 2047 encoding and non-ASCII
214 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
DecodeFilenameValue(const std::string & input,const std::string & referrer_charset,std::string * output,int * parse_result_flags)215 bool DecodeFilenameValue(const std::string& input,
216                          const std::string& referrer_charset,
217                          std::string* output,
218                          int* parse_result_flags) {
219   int current_parse_result_flags = 0;
220   std::string decoded_value;
221   bool is_previous_token_rfc2047 = true;
222 
223   // Tokenize with whitespace characters.
224   base::StringTokenizer t(input, " \t\n\r");
225   t.set_options(base::StringTokenizer::RETURN_DELIMS);
226   while (t.GetNext()) {
227     if (t.token_is_delim()) {
228       // If the previous non-delimeter token is not RFC2047-encoded,
229       // put in a space in its place. Otheriwse, skip over it.
230       if (!is_previous_token_rfc2047)
231         decoded_value.push_back(' ');
232       continue;
233     }
234     // We don't support a single multibyte character split into
235     // adjacent encoded words. Some broken mail clients emit headers
236     // with that problem, but most web servers usually encode a filename
237     // in a single encoded-word. Firefox/Thunderbird do not support
238     // it, either.
239     std::string decoded;
240     if (!DecodeWord(t.token_piece(), referrer_charset,
241                     &is_previous_token_rfc2047, &decoded,
242                     &current_parse_result_flags))
243       return false;
244     decoded_value.append(decoded);
245   }
246   output->swap(decoded_value);
247   if (parse_result_flags && !output->empty())
248     *parse_result_flags |= current_parse_result_flags;
249   return true;
250 }
251 
252 // Parses the charset and value-chars out of an ext-value string.
253 //
254 //  ext-value     = charset  "'" [ language ] "'" value-chars
ParseExtValueComponents(const std::string & input,std::string * charset,std::string * value_chars)255 bool ParseExtValueComponents(const std::string& input,
256                              std::string* charset,
257                              std::string* value_chars) {
258   base::StringTokenizer t(input, "'");
259   t.set_options(base::StringTokenizer::RETURN_DELIMS);
260   base::StringPiece temp_charset;
261   base::StringPiece temp_value;
262   int num_delims_seen = 0;
263   while (t.GetNext()) {
264     if (t.token_is_delim()) {
265       ++num_delims_seen;
266       continue;
267     } else {
268       switch (num_delims_seen) {
269         case 0:
270           temp_charset = t.token_piece();
271           break;
272         case 1:
273           // Language is ignored.
274           break;
275         case 2:
276           temp_value = t.token_piece();
277           break;
278         default:
279           return false;
280       }
281     }
282   }
283   if (num_delims_seen != 2)
284     return false;
285   if (temp_charset.empty() || temp_value.empty())
286     return false;
287   *charset = std::string(temp_charset);
288   *value_chars = std::string(temp_value);
289   return true;
290 }
291 
292 // http://tools.ietf.org/html/rfc5987#section-3.2
293 //
294 //  ext-value     = charset  "'" [ language ] "'" value-chars
295 //
296 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
297 //
298 //  mime-charset  = 1*mime-charsetc
299 //  mime-charsetc = ALPHA / DIGIT
300 //                 / "!" / "#" / "$" / "%" / "&"
301 //                 / "+" / "-" / "^" / "_" / "`"
302 //                 / "{" / "}" / "~"
303 //
304 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
305 //
306 //  value-chars   = *( pct-encoded / attr-char )
307 //
308 //  pct-encoded   = "%" HEXDIG HEXDIG
309 //
310 //  attr-char     = ALPHA / DIGIT
311 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
312 //                 / "^" / "_" / "`" / "|" / "~"
DecodeExtValue(const std::string & param_value,std::string * decoded)313 bool DecodeExtValue(const std::string& param_value, std::string* decoded) {
314   if (param_value.find('"') != std::string::npos)
315     return false;
316 
317   std::string charset;
318   std::string value;
319   if (!ParseExtValueComponents(param_value, &charset, &value))
320     return false;
321 
322   // RFC 5987 value should be ASCII-only.
323   if (!base::IsStringASCII(value)) {
324     decoded->clear();
325     return true;
326   }
327 
328   std::string unescaped =
329       base::UnescapeBinaryURLComponent(value, base::UnescapeRule::NORMAL);
330 
331   return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
332 }
333 
334 } // namespace
335 
// Constructs the object by parsing |header| right away; both |header| and
// |referrer_charset| are forwarded unchanged to Parse().
HttpContentDisposition::HttpContentDisposition(
    const std::string& header,
    const std::string& referrer_charset) {
  Parse(header, referrer_charset);
}
341 
// Nothing to release explicitly; the compiler-generated destructor is used.
HttpContentDisposition::~HttpContentDisposition() = default;
343 
ConsumeDispositionType(std::string::const_iterator begin,std::string::const_iterator end)344 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
345     std::string::const_iterator begin, std::string::const_iterator end) {
346   DCHECK(type_ == INLINE);
347   auto header = base::MakeStringPiece(begin, end);
348   size_t delimiter = header.find(';');
349   base::StringPiece type = header.substr(0, delimiter);
350   type = HttpUtil::TrimLWS(type);
351 
352   // If the disposition-type isn't a valid token the then the
353   // Content-Disposition header is malformed, and we treat the first bytes as
354   // a parameter rather than a disposition-type.
355   if (type.empty() || !HttpUtil::IsToken(type))
356     return begin;
357 
358   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
359 
360   DCHECK(type.find('=') == base::StringPiece::npos);
361 
362   if (base::EqualsCaseInsensitiveASCII(type, "inline")) {
363     type_ = INLINE;
364   } else if (base::EqualsCaseInsensitiveASCII(type, "attachment")) {
365     type_ = ATTACHMENT;
366   } else {
367     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
368     type_ = ATTACHMENT;
369   }
370   return begin + (type.data() + type.size() - header.data());
371 }
372 
373 // http://tools.ietf.org/html/rfc6266
374 //
375 //  content-disposition = "Content-Disposition" ":"
376 //                         disposition-type *( ";" disposition-parm )
377 //
378 //  disposition-type    = "inline" | "attachment" | disp-ext-type
379 //                      ; case-insensitive
380 //  disp-ext-type       = token
381 //
382 //  disposition-parm    = filename-parm | disp-ext-parm
383 //
384 //  filename-parm       = "filename" "=" value
385 //                      | "filename*" "=" ext-value
386 //
387 //  disp-ext-parm       = token "=" value
388 //                      | ext-token "=" ext-value
389 //  ext-token           = <the characters in token, followed by "*">
390 //
Parse(const std::string & header,const std::string & referrer_charset)391 void HttpContentDisposition::Parse(const std::string& header,
392                                    const std::string& referrer_charset) {
393   DCHECK(type_ == INLINE);
394   DCHECK(filename_.empty());
395 
396   std::string::const_iterator pos = header.begin();
397   std::string::const_iterator end = header.end();
398   pos = ConsumeDispositionType(pos, end);
399 
400   std::string filename;
401   std::string ext_filename;
402 
403   HttpUtil::NameValuePairsIterator iter(pos, end, ';');
404   while (iter.GetNext()) {
405     if (filename.empty() &&
406         base::EqualsCaseInsensitiveASCII(iter.name_piece(), "filename")) {
407       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
408                           &parse_result_flags_);
409       if (!filename.empty()) {
410         parse_result_flags_ |= HAS_FILENAME;
411         if (filename[0] == '\'')
412           parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
413       }
414     } else if (ext_filename.empty() && base::EqualsCaseInsensitiveASCII(
415                                            iter.name_piece(), "filename*")) {
416       DecodeExtValue(iter.raw_value(), &ext_filename);
417       if (!ext_filename.empty())
418         parse_result_flags_ |= HAS_EXT_FILENAME;
419     }
420   }
421 
422   if (!ext_filename.empty())
423     filename_ = ext_filename;
424   else
425     filename_ = filename;
426 
427   if (!filename.empty() && filename[0] == '\'')
428     parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
429 }
430 
431 }  // namespace net
432