• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "net/http/http_content_disposition.h"
11 
12 #include <string_view>
13 
14 #include "base/base64.h"
15 #include "base/check_op.h"
16 #include "base/strings/escape.h"
17 #include "base/strings/string_tokenizer.h"
18 #include "base/strings/string_util.h"
19 #include "base/strings/sys_string_conversions.h"
20 #include "base/strings/utf_string_conversions.h"
21 #include "net/base/net_string_util.h"
22 #include "net/http/http_util.h"
23 
24 namespace net {
25 
26 namespace {
27 
// Transfer encodings that may appear in the <E> field of an RFC 2047
// encoded-word.
enum RFC2047EncodingType {
  Q_ENCODING,  // "Q": the quoted-printable-like scheme of RFC 2047 section 4.2.
  B_ENCODING,  // "B": base64.
};
32 
33 // Decodes a "Q" encoded string as described in RFC 2047 section 4.2. Similar to
34 // decoding a quoted-printable string.  Returns true if the input was valid.
DecodeQEncoding(std::string_view input,std::string * output)35 bool DecodeQEncoding(std::string_view input, std::string* output) {
36   std::string temp;
37   temp.reserve(input.size());
38   for (auto it = input.begin(); it != input.end(); ++it) {
39     if (*it == '_') {
40       temp.push_back(' ');
41     } else if (*it == '=') {
42       if ((input.end() - it < 3) ||
43           !base::IsHexDigit(static_cast<unsigned char>(*(it + 1))) ||
44           !base::IsHexDigit(static_cast<unsigned char>(*(it + 2))))
45         return false;
46       unsigned char ch =
47           base::HexDigitToInt(*(it + 1)) * 16 + base::HexDigitToInt(*(it + 2));
48       temp.push_back(static_cast<char>(ch));
49       ++it;
50       ++it;
51     } else if (0x20 < *it && *it < 0x7F && *it != '?') {
52       // In a Q-encoded word, only printable ASCII characters
53       // represent themselves. Besides, space, '=', '_' and '?' are
54       // not allowed, but they're already filtered out.
55       DCHECK_NE('=', *it);
56       DCHECK_NE('?', *it);
57       DCHECK_NE('_', *it);
58       temp.push_back(*it);
59     } else {
60       return false;
61     }
62   }
63   output->swap(temp);
64   return true;
65 }
66 
67 // Decodes a "Q" or "B" encoded string as per RFC 2047 section 4. The encoding
68 // type is specified in |enc_type|.
DecodeBQEncoding(std::string_view part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)69 bool DecodeBQEncoding(std::string_view part,
70                       RFC2047EncodingType enc_type,
71                       const std::string& charset,
72                       std::string* output) {
73   std::string decoded;
74   if (!((enc_type == B_ENCODING) ?
75         base::Base64Decode(part, &decoded) : DecodeQEncoding(part, &decoded))) {
76     return false;
77   }
78 
79   if (decoded.empty()) {
80     output->clear();
81     return true;
82   }
83 
84   return ConvertToUtf8(decoded, charset.c_str(), output);
85 }
86 
DecodeWord(std::string_view encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output,int * parse_result_flags)87 bool DecodeWord(std::string_view encoded_word,
88                 const std::string& referrer_charset,
89                 bool* is_rfc2047,
90                 std::string* output,
91                 int* parse_result_flags) {
92   *is_rfc2047 = false;
93   output->clear();
94   if (encoded_word.empty())
95     return true;
96 
97   if (!base::IsStringASCII(encoded_word)) {
98     // Try UTF-8, referrer_charset and the native OS default charset in turn.
99     if (base::IsStringUTF8(encoded_word)) {
100       *output = std::string(encoded_word);
101     } else {
102       std::u16string utf16_output;
103       if (!referrer_charset.empty() &&
104           ConvertToUTF16(encoded_word, referrer_charset.c_str(),
105                          &utf16_output)) {
106         *output = base::UTF16ToUTF8(utf16_output);
107       } else {
108         *output = base::WideToUTF8(base::SysNativeMBToWide(encoded_word));
109       }
110     }
111 
112     *parse_result_flags |= HttpContentDisposition::HAS_NON_ASCII_STRINGS;
113     return true;
114   }
115 
116   // RFC 2047 : one of encoding methods supported by Firefox and relatively
117   // widely used by web servers.
118   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
119   // We don't care about the length restriction (72 bytes) because
120   // many web servers generate encoded words longer than the limit.
121   std::string decoded_word;
122   *is_rfc2047 = true;
123   int part_index = 0;
124   std::string charset;
125   base::CStringTokenizer t(encoded_word.data(),
126                            encoded_word.data() + encoded_word.size(), "?");
127   RFC2047EncodingType enc_type = Q_ENCODING;
128   while (*is_rfc2047 && t.GetNext()) {
129     std::string_view part = t.token_piece();
130     switch (part_index) {
131       case 0:
132         if (part != "=") {
133           *is_rfc2047 = false;
134           break;
135         }
136         ++part_index;
137         break;
138       case 1:
139         // Do we need charset validity check here?
140         charset = std::string(part);
141         ++part_index;
142         break;
143       case 2:
144         if (part.size() > 1 ||
145             part.find_first_of("bBqQ") == std::string::npos) {
146           *is_rfc2047 = false;
147           break;
148         }
149         if (part[0] == 'b' || part[0] == 'B') {
150           enc_type = B_ENCODING;
151         }
152         ++part_index;
153         break;
154       case 3:
155         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &decoded_word);
156         if (!*is_rfc2047) {
157           // Last minute failure. Invalid B/Q encoding. Rather than
158           // passing it through, return now.
159           return false;
160         }
161         ++part_index;
162         break;
163       case 4:
164         if (part != "=") {
165           // Another last minute failure !
166           // Likely to be a case of two encoded-words in a row or
167           // an encoded word followed by a non-encoded word. We can be
168           // generous, but it does not help much in terms of compatibility,
169           // I believe. Return immediately.
170           *is_rfc2047 = false;
171           return false;
172         }
173         ++part_index;
174         break;
175       default:
176         *is_rfc2047 = false;
177         return false;
178     }
179   }
180 
181   if (*is_rfc2047) {
182     if (*(encoded_word.end() - 1) == '=') {
183       output->swap(decoded_word);
184       *parse_result_flags |=
185           HttpContentDisposition::HAS_RFC2047_ENCODED_STRINGS;
186       return true;
187     }
188     // encoded_word ending prematurelly with '?' or extra '?'
189     *is_rfc2047 = false;
190     return false;
191   }
192 
193   // We're not handling 'especial' characters quoted with '\', but
194   // it should be Ok because we're not an email client but a
195   // web browser.
196 
197   // What IE6/7 does: %-escaped UTF-8.
198   decoded_word = base::UnescapeBinaryURLComponent(encoded_word,
199                                                   base::UnescapeRule::NORMAL);
200   if (decoded_word != encoded_word)
201     *parse_result_flags |= HttpContentDisposition::HAS_PERCENT_ENCODED_STRINGS;
202   if (base::IsStringUTF8(decoded_word)) {
203     output->swap(decoded_word);
204     return true;
205     // We can try either the OS default charset or 'origin charset' here,
206     // As far as I can tell, IE does not support it. However, I've seen
207     // web servers emit %-escaped string in a legacy encoding (usually
208     // origin charset).
209     // TODO(jungshik) : Test IE further and consider adding a fallback here.
210   }
211   return false;
212 }
213 
214 // Decodes the value of a 'filename' or 'name' parameter given as |input|. The
215 // value is supposed to be of the form:
216 //
217 //   value                   = token | quoted-string
218 //
219 // However we currently also allow RFC 2047 encoding and non-ASCII
220 // strings. Non-ASCII strings are interpreted based on |referrer_charset|.
DecodeFilenameValue(std::string_view input,const std::string & referrer_charset,std::string * output,int * parse_result_flags)221 bool DecodeFilenameValue(std::string_view input,
222                          const std::string& referrer_charset,
223                          std::string* output,
224                          int* parse_result_flags) {
225   int current_parse_result_flags = 0;
226   std::string decoded_value;
227   bool is_previous_token_rfc2047 = true;
228 
229   // Tokenize with whitespace characters.
230   base::StringViewTokenizer t(input, " \t\n\r");
231   t.set_options(base::StringViewTokenizer::RETURN_DELIMS);
232   while (t.GetNext()) {
233     if (t.token_is_delim()) {
234       // If the previous non-delimeter token is not RFC2047-encoded,
235       // put in a space in its place. Otheriwse, skip over it.
236       if (!is_previous_token_rfc2047)
237         decoded_value.push_back(' ');
238       continue;
239     }
240     // We don't support a single multibyte character split into
241     // adjacent encoded words. Some broken mail clients emit headers
242     // with that problem, but most web servers usually encode a filename
243     // in a single encoded-word. Firefox/Thunderbird do not support
244     // it, either.
245     std::string decoded;
246     if (!DecodeWord(t.token_piece(), referrer_charset,
247                     &is_previous_token_rfc2047, &decoded,
248                     &current_parse_result_flags))
249       return false;
250     decoded_value.append(decoded);
251   }
252   output->swap(decoded_value);
253   if (parse_result_flags && !output->empty())
254     *parse_result_flags |= current_parse_result_flags;
255   return true;
256 }
257 
// Parses the charset and value-chars out of an ext-value string.
//
//  ext-value     = charset  "'" [ language ] "'" value-chars
//
// |input| must contain exactly two single quotes, with a non-empty charset
// before the first and non-empty value-chars after the second. The optional
// language between the quotes is ignored. On success copies the two
// components into |charset| and |value_chars| and returns true; otherwise
// returns false and leaves both outputs untouched.
bool ParseExtValueComponents(std::string_view input,
                             std::string* charset,
                             std::string* value_chars) {
  constexpr char kDelimiter = '\'';

  const size_t first_quote = input.find(kDelimiter);
  if (first_quote == std::string_view::npos) {
    return false;
  }
  const size_t second_quote = input.find(kDelimiter, first_quote + 1);
  if (second_quote == std::string_view::npos) {
    return false;
  }
  // A third quote anywhere after the second makes the value malformed.
  if (input.find(kDelimiter, second_quote + 1) != std::string_view::npos) {
    return false;
  }

  const std::string_view charset_part = input.substr(0, first_quote);
  const std::string_view value_part = input.substr(second_quote + 1);
  if (charset_part.empty() || value_part.empty()) {
    return false;
  }

  *charset = std::string(charset_part);
  *value_chars = std::string(value_part);
  return true;
}
297 
298 // http://tools.ietf.org/html/rfc5987#section-3.2
299 //
300 //  ext-value     = charset  "'" [ language ] "'" value-chars
301 //
302 //  charset       = "UTF-8" / "ISO-8859-1" / mime-charset
303 //
304 //  mime-charset  = 1*mime-charsetc
305 //  mime-charsetc = ALPHA / DIGIT
306 //                 / "!" / "#" / "$" / "%" / "&"
307 //                 / "+" / "-" / "^" / "_" / "`"
308 //                 / "{" / "}" / "~"
309 //
310 //  language      = <Language-Tag, defined in [RFC5646], Section 2.1>
311 //
312 //  value-chars   = *( pct-encoded / attr-char )
313 //
314 //  pct-encoded   = "%" HEXDIG HEXDIG
315 //
316 //  attr-char     = ALPHA / DIGIT
317 //                 / "!" / "#" / "$" / "&" / "+" / "-" / "."
318 //                 / "^" / "_" / "`" / "|" / "~"
DecodeExtValue(std::string_view param_value,std::string * decoded)319 bool DecodeExtValue(std::string_view param_value, std::string* decoded) {
320   if (param_value.find('"') != std::string::npos)
321     return false;
322 
323   std::string charset;
324   std::string value;
325   if (!ParseExtValueComponents(param_value, &charset, &value))
326     return false;
327 
328   // RFC 5987 value should be ASCII-only.
329   if (!base::IsStringASCII(value)) {
330     decoded->clear();
331     return true;
332   }
333 
334   std::string unescaped =
335       base::UnescapeBinaryURLComponent(value, base::UnescapeRule::NORMAL);
336 
337   return ConvertToUtf8AndNormalize(unescaped, charset.c_str(), decoded);
338 }
339 
340 } // namespace
341 
HttpContentDisposition(const std::string & header,const std::string & referrer_charset)342 HttpContentDisposition::HttpContentDisposition(
343     const std::string& header,
344     const std::string& referrer_charset) {
345   Parse(header, referrer_charset);
346 }
347 
348 HttpContentDisposition::~HttpContentDisposition() = default;
349 
ConsumeDispositionType(std::string::const_iterator begin,std::string::const_iterator end)350 std::string::const_iterator HttpContentDisposition::ConsumeDispositionType(
351     std::string::const_iterator begin, std::string::const_iterator end) {
352   DCHECK(type_ == INLINE);
353   auto header = base::MakeStringPiece(begin, end);
354   size_t delimiter = header.find(';');
355   std::string_view type = header.substr(0, delimiter);
356   type = HttpUtil::TrimLWS(type);
357 
358   // If the disposition-type isn't a valid token the then the
359   // Content-Disposition header is malformed, and we treat the first bytes as
360   // a parameter rather than a disposition-type.
361   if (type.empty() || !HttpUtil::IsToken(type))
362     return begin;
363 
364   parse_result_flags_ |= HAS_DISPOSITION_TYPE;
365 
366   DCHECK(type.find('=') == std::string_view::npos);
367 
368   if (base::EqualsCaseInsensitiveASCII(type, "inline")) {
369     type_ = INLINE;
370   } else if (base::EqualsCaseInsensitiveASCII(type, "attachment")) {
371     type_ = ATTACHMENT;
372   } else {
373     parse_result_flags_ |= HAS_UNKNOWN_DISPOSITION_TYPE;
374     type_ = ATTACHMENT;
375   }
376   return begin + (type.data() + type.size() - header.data());
377 }
378 
379 // http://tools.ietf.org/html/rfc6266
380 //
381 //  content-disposition = "Content-Disposition" ":"
382 //                         disposition-type *( ";" disposition-parm )
383 //
384 //  disposition-type    = "inline" | "attachment" | disp-ext-type
385 //                      ; case-insensitive
386 //  disp-ext-type       = token
387 //
388 //  disposition-parm    = filename-parm | disp-ext-parm
389 //
390 //  filename-parm       = "filename" "=" value
391 //                      | "filename*" "=" ext-value
392 //
393 //  disp-ext-parm       = token "=" value
394 //                      | ext-token "=" ext-value
395 //  ext-token           = <the characters in token, followed by "*">
396 //
Parse(const std::string & header,const std::string & referrer_charset)397 void HttpContentDisposition::Parse(const std::string& header,
398                                    const std::string& referrer_charset) {
399   DCHECK(type_ == INLINE);
400   DCHECK(filename_.empty());
401 
402   std::string::const_iterator pos = header.begin();
403   std::string::const_iterator end = header.end();
404   pos = ConsumeDispositionType(pos, end);
405 
406   std::string filename;
407   std::string ext_filename;
408 
409   HttpUtil::NameValuePairsIterator iter(base::MakeStringPiece(pos, end), ';');
410   while (iter.GetNext()) {
411     if (filename.empty() &&
412         base::EqualsCaseInsensitiveASCII(iter.name(), "filename")) {
413       DecodeFilenameValue(iter.value(), referrer_charset, &filename,
414                           &parse_result_flags_);
415       if (!filename.empty()) {
416         parse_result_flags_ |= HAS_FILENAME;
417         if (filename[0] == '\'')
418           parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
419       }
420     } else if (ext_filename.empty() &&
421                base::EqualsCaseInsensitiveASCII(iter.name(), "filename*")) {
422       DecodeExtValue(iter.raw_value(), &ext_filename);
423       if (!ext_filename.empty())
424         parse_result_flags_ |= HAS_EXT_FILENAME;
425     }
426   }
427 
428   if (!ext_filename.empty())
429     filename_ = ext_filename;
430   else
431     filename_ = filename;
432 
433   if (!filename.empty() && filename[0] == '\'')
434     parse_result_flags_ |= HAS_SINGLE_QUOTED_FILENAME;
435 }
436 
437 }  // namespace net
438