• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Detecting mime types is a tricky business because we need to balance
6 // compatibility concerns with security issues.  Here is a survey of how other
7 // browsers behave and then a description of how we intend to behave.
8 //
9 // HTML payload, no Content-Type header:
10 // * IE 7: Render as HTML
11 // * Firefox 2: Render as HTML
12 // * Safari 3: Render as HTML
13 // * Opera 9: Render as HTML
14 //
15 // Here the choice seems clear:
16 // => Chrome: Render as HTML
17 //
18 // HTML payload, Content-Type: "text/plain":
19 // * IE 7: Render as HTML
20 // * Firefox 2: Render as text
21 // * Safari 3: Render as text (Note: Safari will Render as HTML if the URL
22 //                                   has an HTML extension)
23 // * Opera 9: Render as text
24 //
25 // Here we choose to follow the majority (and break some compatibility with IE).
26 // Many folks dislike IE's behavior here.
27 // => Chrome: Render as text
28 // We generalize this as follows.  If the Content-Type header is text/plain
29 // we won't detect dangerous mime types (those that can execute script).
30 //
31 // HTML payload, Content-Type: "application/octet-stream":
32 // * IE 7: Render as HTML
33 // * Firefox 2: Download as application/octet-stream
34 // * Safari 3: Render as HTML
35 // * Opera 9: Render as HTML
36 //
37 // We follow Firefox.
38 // => Chrome: Download as application/octet-stream
39 // One factor in this decision is that IIS 4 and 5 will send
40 // application/octet-stream for .xhtml files (because they don't recognize
41 // the extension).  We did some experiments and it looks like this doesn't occur
42 // very often on the web.  We choose the more secure option.
43 //
44 // GIF payload, no Content-Type header:
45 // * IE 7: Render as GIF
46 // * Firefox 2: Render as GIF
47 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
48 //                                        URL has a GIF extension)
49 // * Opera 9: Render as GIF
50 //
51 // The choice is clear.
52 // => Chrome: Render as GIF
53 // Once we decide to render HTML without a Content-Type header, there isn't much
54 // reason not to render GIFs.
55 //
56 // GIF payload, Content-Type: "text/plain":
57 // * IE 7: Render as GIF
58 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
59 //                              Download as GIF if the URL has a GIF extension)
60 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
61 //                                        URL has a GIF extension)
62 // * Opera 9: Render as GIF
63 //
64 // Displaying as text/plain makes little sense as the content will look like
65 // gibberish.  Here, we could change our minds and download.
66 // => Chrome: Render as GIF
67 //
68 // GIF payload, Content-Type: "application/octet-stream":
69 // * IE 7: Render as GIF
70 // * Firefox 2: Download as application/octet-stream (Note: Firefox will
71 //                              Download as GIF if the URL has a GIF extension)
72 // * Safari 3: Download as Unknown (Note: Safari will Render as GIF if the
73 //                                        URL has a GIF extension)
74 // * Opera 9: Render as GIF
75 //
76 // We used to render as GIF here, but the problem is that some sites want to
77 // trigger downloads by sending application/octet-stream (even though they
78 // should be sending Content-Disposition: attachment).  Although it is safe
79 // to render as GIF from a security perspective, we actually get better
80 // compatibility if we don't sniff from application/octet-stream at all.
81 // => Chrome: Download as application/octet-stream
82 //
83 // Note that our definition of HTML payload is much stricter than IE's
84 // definition and roughly the same as Firefox's definition.
85 
86 #include <stdint.h>
87 #include <string>
88 
89 #include "net/base/mime_sniffer.h"
90 
91 #include "base/check_op.h"
92 #include "base/containers/span.h"
93 #include "base/notreached.h"
94 #include "base/strings/string_util.h"
95 #include "build/build_config.h"
96 #include "url/gurl.h"
97 
98 namespace net {
99 
// Number of leading content bytes needed to evaluate every magic number in
// this file. MatchMagicNumber() DCHECKs that no entry is longer than this, so
// raise the value when adding a longer magic number.
static constexpr size_t kBytesRequiredForMagic = 42;
103 
// One row of a sniffing table: a pattern together with the label (normally a
// MIME type) that is reported when the pattern matches the start of the
// content.
struct MagicNumber {
  // Value copied into the sniffing result on a match.
  const char* const mime_type;

  // The pattern to look for at the start of the content. For non-string rows
  // a '.' byte matches any content byte (see MagicCmp()).
  const std::string_view magic;

  // True when |magic| is text to be matched as an ASCII case-insensitive
  // prefix; false when it is a byte pattern.
  bool is_string;

  // Optional AND-mask applied to the content before comparing. If set, must
  // have the same length as |magic|.
  const char* const mask;
};
110 
// Expands to the initializer of a MagicNumber row for a plain byte pattern (no
// mask, not a string). |magic| must be a string literal: its length is taken
// with sizeof, so embedded '\0' bytes are kept and only the terminating NUL is
// dropped.
#define MAGIC_NUMBER(mime_type, magic)                         \
  {                                                            \
    (mime_type), std::string_view((magic), sizeof(magic) - 1), \
        false, nullptr                                         \
  }
113 
// Compile-time guard: instantiating VerifySizes<A, B> fails to compile unless
// A == B. SIZES exposes the (common) size.
template <int MagicSize, int MaskSize>
struct VerifySizes {
  static_assert(MagicSize == MaskSize, "sizes must be equal");

  enum { SIZES = MagicSize };
};

// Evaluates to sizeof(magic) while verifying at compile time that
// sizeof(mask) is the same.
#define verified_sizeof(magic, mask) \
  VerifySizes<sizeof(magic), sizeof(mask)>::SIZES
124 
// Expands to the initializer of a MagicNumber row whose comparison ANDs each
// content byte with |mask| first (see MagicMaskCmp()). |magic| and |mask| must
// be string literals of identical size; verified_sizeof() rejects a mismatch
// at compile time.
#define MAGIC_MASK(mime_type, magic, mask)                                  \
  {                                                                         \
    (mime_type),                                                            \
        std::string_view((magic), verified_sizeof(magic, mask) - 1), false, \
        (mask)                                                              \
  }
130 
// Expands to the initializer of a MagicNumber row for a textual prefix. Such
// rows are matched case-insensitively and must not include '\0' characters.
#define MAGIC_STRING(mime_type, magic)                         \
  {                                                            \
    (mime_type), std::string_view((magic), sizeof(magic) - 1), \
        true, nullptr                                          \
  }
134 
135 static const MagicNumber kMagicNumbers[] = {
136   // Source: HTML 5 specification
137   MAGIC_NUMBER("application/pdf", "%PDF-"),
138   MAGIC_NUMBER("application/postscript", "%!PS-Adobe-"),
139   MAGIC_NUMBER("image/gif", "GIF87a"),
140   MAGIC_NUMBER("image/gif", "GIF89a"),
141   MAGIC_NUMBER("image/png", "\x89" "PNG\x0D\x0A\x1A\x0A"),
142   MAGIC_NUMBER("image/jpeg", "\xFF\xD8\xFF"),
143   MAGIC_NUMBER("image/bmp", "BM"),
144   // Source: Mozilla
145   MAGIC_NUMBER("text/plain", "#!"),  // Script
146   MAGIC_NUMBER("text/plain", "%!"),  // Script, similar to PS
147   MAGIC_NUMBER("text/plain", "From"),
148   MAGIC_NUMBER("text/plain", ">From"),
149   // Chrome specific
150   MAGIC_NUMBER("application/x-gzip", "\x1F\x8B\x08"),
151   MAGIC_NUMBER("audio/x-pn-realaudio", "\x2E\x52\x4D\x46"),
152   MAGIC_NUMBER("video/x-ms-asf",
153       "\x30\x26\xB2\x75\x8E\x66\xCF\x11\xA6\xD9\x00\xAA\x00\x62\xCE\x6C"),
154   MAGIC_NUMBER("image/tiff", "I I"),
155   MAGIC_NUMBER("image/tiff", "II*"),
156   MAGIC_NUMBER("image/tiff", "MM\x00*"),
157   MAGIC_NUMBER("audio/mpeg", "ID3"),
158   MAGIC_NUMBER("image/webp", "RIFF....WEBPVP"),
159   MAGIC_NUMBER("video/webm", "\x1A\x45\xDF\xA3"),
160   MAGIC_NUMBER("application/zip", "PK\x03\x04"),
161   MAGIC_NUMBER("application/x-rar-compressed", "Rar!\x1A\x07\x00"),
162   MAGIC_NUMBER("application/x-msmetafile", "\xD7\xCD\xC6\x9A"),
163   MAGIC_NUMBER("application/octet-stream", "MZ"),  // EXE
164   // Sniffing for Flash:
165   //
166   //   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
167   //   MAGIC_NUMBER("application/x-shockwave-flash", "FLV"),
168   //   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
169   //
170   // Including these magic number for Flash is a trade off.
171   //
172   // Pros:
173   //   * Flash is an important and popular file format
174   //
175   // Cons:
176   //   * These patterns are fairly weak
177   //   * If we mistakenly decide something is Flash, we will execute it
178   //     in the origin of an unsuspecting site.  This could be a security
179   //     vulnerability if the site allows users to upload content.
180   //
181   // On balance, we do not include these patterns.
182 };
183 
// Number of leading content bytes needed to evaluate every Microsoft Office
// magic number (the longest entry of kOfficeMagicNumbers).
static constexpr size_t kBytesRequiredForOfficeMagic = 8;
187 
188 static const MagicNumber kOfficeMagicNumbers[] = {
189   MAGIC_NUMBER("CFB", "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"),
190   MAGIC_NUMBER("OOXML", "PK\x03\x04"),
191 };
192 
// Family of Office document implied by a URL's file extension.
enum OfficeDocType {
  DOC_TYPE_WORD,        // .doc / .docx
  DOC_TYPE_EXCEL,       // .xls / .xlsx
  DOC_TYPE_POWERPOINT,  // .ppt / .pptx
  DOC_TYPE_NONE         // No recognized Office extension.
};
199 
200 struct OfficeExtensionType {
201   OfficeDocType doc_type;
202   const std::string_view extension;
203 };
204 
// Expands to the initializer of an OfficeExtensionType row. |extension| must
// be a string literal; its length is taken with sizeof.
#define OFFICE_EXTENSION(type, extension)                            \
  {                                                                  \
    (type), std::string_view((extension), sizeof(extension) - 1)     \
  }
207 
208 static const OfficeExtensionType kOfficeExtensionTypes[] = {
209   OFFICE_EXTENSION(DOC_TYPE_WORD, ".doc"),
210   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xls"),
211   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".ppt"),
212   OFFICE_EXTENSION(DOC_TYPE_WORD, ".docx"),
213   OFFICE_EXTENSION(DOC_TYPE_EXCEL, ".xlsx"),
214   OFFICE_EXTENSION(DOC_TYPE_POWERPOINT, ".pptx"),
215 };
216 
217 static const MagicNumber kExtraMagicNumbers[] = {
218   MAGIC_NUMBER("image/x-xbitmap", "#define"),
219   MAGIC_NUMBER("image/x-icon", "\x00\x00\x01\x00"),
220   MAGIC_NUMBER("audio/wav", "RIFF....WAVEfmt "),
221   MAGIC_NUMBER("video/avi", "RIFF....AVI LIST"),
222   MAGIC_NUMBER("audio/ogg", "OggS\0"),
223   MAGIC_MASK("video/mpeg", "\x00\x00\x01\xB0", "\xFF\xFF\xFF\xF0"),
224   MAGIC_MASK("audio/mpeg", "\xFF\xE0", "\xFF\xE0"),
225   MAGIC_NUMBER("video/3gpp", "....ftyp3g"),
226   MAGIC_NUMBER("video/3gpp", "....ftypavcl"),
227   MAGIC_NUMBER("video/mp4", "....ftyp"),
228   MAGIC_NUMBER("video/quicktime", "....moov"),
229   MAGIC_NUMBER("application/x-shockwave-flash", "CWS"),
230   MAGIC_NUMBER("application/x-shockwave-flash", "FWS"),
231   MAGIC_NUMBER("video/x-flv", "FLV"),
232   MAGIC_NUMBER("audio/x-flac", "fLaC"),
233   // Per https://tools.ietf.org/html/rfc3267#section-8.1
234   MAGIC_NUMBER("audio/amr", "#!AMR\n"),
235 
236   // RAW image types.
237   MAGIC_NUMBER("image/x-canon-cr2", "II\x2a\x00\x10\x00\x00\x00CR"),
238   MAGIC_NUMBER("image/x-canon-crw", "II\x1a\x00\x00\x00HEAPCCDR"),
239   MAGIC_NUMBER("image/x-minolta-mrw", "\x00MRM"),
240   MAGIC_NUMBER("image/x-olympus-orf", "MMOR"),  // big-endian
241   MAGIC_NUMBER("image/x-olympus-orf", "IIRO"),  // little-endian
242   MAGIC_NUMBER("image/x-olympus-orf", "IIRS"),  // little-endian
243   MAGIC_NUMBER("image/x-fuji-raf", "FUJIFILMCCD-RAW "),
244   MAGIC_NUMBER("image/x-panasonic-raw",
245                "IIU\x00\x08\x00\x00\x00"),  // Panasonic .raw
246   MAGIC_NUMBER("image/x-panasonic-raw",
247                "IIU\x00\x18\x00\x00\x00"),  // Panasonic .rw2
248   MAGIC_NUMBER("image/x-phaseone-raw", "MMMMRaw"),
249   MAGIC_NUMBER("image/x-x3f", "FOVb"),
250 };
251 
// Our HTML sniffer differs slightly from Mozilla's.  For example, Mozilla
// treats a document that begins "<!DOCTYPE SOAP-ENV:Envelope PUBLIC " as
// HTML, but we do not.

// Expands to a "text/html" MAGIC_STRING row that matches "<" followed by
// |tag|. |tag| must be a string literal so it can be concatenated with "<".
#define MAGIC_HTML_TAG(tag) MAGIC_STRING("text/html", "<" tag)
258 
259 static const MagicNumber kSniffableTags[] = {
260   // XML processing directive.  Although this is not an HTML mime type, we sniff
261   // for this in the HTML phase because text/xml is just as powerful as HTML and
262   // we want to leverage our white space skipping technology.
263   MAGIC_NUMBER("text/xml", "<?xml"),  // Mozilla
264   // DOCTYPEs
265   MAGIC_HTML_TAG("!DOCTYPE html"),  // HTML5 spec
266   // Sniffable tags, ordered by how often they occur in sniffable documents.
267   MAGIC_HTML_TAG("script"),  // HTML5 spec, Mozilla
268   MAGIC_HTML_TAG("html"),  // HTML5 spec, Mozilla
269   MAGIC_HTML_TAG("!--"),
270   MAGIC_HTML_TAG("head"),  // HTML5 spec, Mozilla
271   MAGIC_HTML_TAG("iframe"),  // Mozilla
272   MAGIC_HTML_TAG("h1"),  // Mozilla
273   MAGIC_HTML_TAG("div"),  // Mozilla
274   MAGIC_HTML_TAG("font"),  // Mozilla
275   MAGIC_HTML_TAG("table"),  // Mozilla
276   MAGIC_HTML_TAG("a"),  // Mozilla
277   MAGIC_HTML_TAG("style"),  // Mozilla
278   MAGIC_HTML_TAG("title"),  // Mozilla
279   MAGIC_HTML_TAG("b"),  // Mozilla
280   MAGIC_HTML_TAG("body"),  // Mozilla
281   MAGIC_HTML_TAG("br"),
282   MAGIC_HTML_TAG("p"),  // Mozilla
283 };
284 
285 // Compare content header to a magic number where magic_entry can contain '.'
286 // for single character of anything, allowing some bytes to be skipped.
MagicCmp(std::string_view content,std::string_view magic_entry)287 static bool MagicCmp(std::string_view content, std::string_view magic_entry) {
288   DCHECK_GE(content.length(), magic_entry.length());
289 
290   for (size_t i = 0; i < magic_entry.length(); ++i) {
291     if (magic_entry[i] != '.' && magic_entry[i] != content[i])
292       return false;
293   }
294   return true;
295 }
296 
297 // Like MagicCmp() except that it ANDs each byte with a mask before
298 // the comparison, because there are some bits we don't care about.
MagicMaskCmp(std::string_view content,std::string_view magic_entry,std::string_view magic_mask)299 static bool MagicMaskCmp(std::string_view content,
300                          std::string_view magic_entry,
301                          std::string_view magic_mask) {
302   DCHECK_GE(content.length(), magic_entry.length());
303 
304   for (size_t i = 0; i < magic_entry.length(); ++i) {
305     if (magic_entry[i] != '.' && magic_entry[i] != (magic_mask[i] & content[i]))
306       return false;
307   }
308   return true;
309 }
310 
MatchMagicNumber(std::string_view content,const MagicNumber & magic_entry,std::string * result)311 static bool MatchMagicNumber(std::string_view content,
312                              const MagicNumber& magic_entry,
313                              std::string* result) {
314   // Keep kBytesRequiredForMagic honest.
315   DCHECK_LE(magic_entry.magic.length(), kBytesRequiredForMagic);
316 
317   bool match = false;
318   if (content.length() >= magic_entry.magic.length()) {
319     if (magic_entry.is_string) {
320       // Consistency check - string entries should have no embedded nulls.
321       DCHECK_EQ(std::string_view::npos, magic_entry.magic.find('\0'));
322 
323       // Do a case-insensitive prefix comparison.
324       match = base::StartsWith(content, magic_entry.magic,
325                                base::CompareCase::INSENSITIVE_ASCII);
326     } else if (!magic_entry.mask) {
327       match = MagicCmp(content, magic_entry.magic);
328     } else {
329       std::string_view magic_mask(magic_entry.mask, magic_entry.magic.length());
330       match = MagicMaskCmp(content, magic_entry.magic, magic_mask);
331     }
332   }
333 
334   if (match) {
335     result->assign(magic_entry.mime_type);
336     return true;
337   }
338   return false;
339 }
340 
CheckForMagicNumbers(std::string_view content,base::span<const MagicNumber> magic_numbers,std::string * result)341 static bool CheckForMagicNumbers(std::string_view content,
342                                  base::span<const MagicNumber> magic_numbers,
343                                  std::string* result) {
344   for (const MagicNumber& magic : magic_numbers) {
345     if (MatchMagicNumber(content, magic, result))
346       return true;
347   }
348   return false;
349 }
350 
351 // Truncates |string_piece| to length |max_size| and returns true if
352 // |string_piece| is now exactly |max_size|.
TruncateStringPiece(const size_t max_size,std::string_view * string_piece)353 static bool TruncateStringPiece(const size_t max_size,
354                                 std::string_view* string_piece) {
355   // Keep kMaxBytesToSniff honest.
356   DCHECK_LE(static_cast<int>(max_size), kMaxBytesToSniff);
357 
358   *string_piece = string_piece->substr(0, max_size);
359   return string_piece->length() == max_size;
360 }
361 
362 // Returns true and sets result if the content appears to be HTML.
363 // Clears have_enough_content if more data could possibly change the result.
SniffForHTML(std::string_view content,bool * have_enough_content,std::string * result)364 static bool SniffForHTML(std::string_view content,
365                          bool* have_enough_content,
366                          std::string* result) {
367   // For HTML, we are willing to consider up to 512 bytes. This may be overly
368   // conservative as IE only considers 256.
369   *have_enough_content &= TruncateStringPiece(512, &content);
370 
371   // We adopt a strategy similar to that used by Mozilla to sniff HTML tags,
372   // but with some modifications to better match the HTML5 spec.
373   std::string_view trimmed =
374       base::TrimWhitespaceASCII(content, base::TRIM_LEADING);
375 
376   // |trimmed| now starts at first non-whitespace character (or is empty).
377   return CheckForMagicNumbers(trimmed, kSniffableTags, result);
378 }
379 
380 // Returns true and sets result if the content matches any of kMagicNumbers.
381 // Clears have_enough_content if more data could possibly change the result.
SniffForMagicNumbers(std::string_view content,bool * have_enough_content,std::string * result)382 static bool SniffForMagicNumbers(std::string_view content,
383                                  bool* have_enough_content,
384                                  std::string* result) {
385   *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
386 
387   // Check our big table of Magic Numbers
388   return CheckForMagicNumbers(content, kMagicNumbers, result);
389 }
390 
391 // Returns true and sets result if the content matches any of
392 // kOfficeMagicNumbers, and the URL has the proper extension.
393 // Clears |have_enough_content| if more data could possibly change the result.
SniffForOfficeDocs(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)394 static bool SniffForOfficeDocs(std::string_view content,
395                                const GURL& url,
396                                bool* have_enough_content,
397                                std::string* result) {
398   *have_enough_content &=
399       TruncateStringPiece(kBytesRequiredForOfficeMagic, &content);
400 
401   // Check our table of magic numbers for Office file types.
402   std::string office_version;
403   if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version))
404     return false;
405 
406   OfficeDocType type = DOC_TYPE_NONE;
407   std::string_view url_path = url.path_piece();
408   for (const auto& office_extension : kOfficeExtensionTypes) {
409     if (base::EndsWith(url_path, office_extension.extension,
410                        base::CompareCase::INSENSITIVE_ASCII)) {
411       type = office_extension.doc_type;
412       break;
413     }
414   }
415 
416   if (type == DOC_TYPE_NONE)
417     return false;
418 
419   if (office_version == "CFB") {
420     switch (type) {
421       case DOC_TYPE_WORD:
422         *result = "application/msword";
423         return true;
424       case DOC_TYPE_EXCEL:
425         *result = "application/vnd.ms-excel";
426         return true;
427       case DOC_TYPE_POWERPOINT:
428         *result = "application/vnd.ms-powerpoint";
429         return true;
430       case DOC_TYPE_NONE:
431         NOTREACHED();
432     }
433   } else if (office_version == "OOXML") {
434     switch (type) {
435       case DOC_TYPE_WORD:
436         *result = "application/vnd.openxmlformats-officedocument."
437                   "wordprocessingml.document";
438         return true;
439       case DOC_TYPE_EXCEL:
440         *result = "application/vnd.openxmlformats-officedocument."
441                   "spreadsheetml.sheet";
442         return true;
443       case DOC_TYPE_POWERPOINT:
444         *result = "application/vnd.openxmlformats-officedocument."
445                   "presentationml.presentation";
446         return true;
447       case DOC_TYPE_NONE:
448         NOTREACHED();
449     }
450   }
451 
452   NOTREACHED();
453 }
454 
// Returns true if |type_hint| is exactly (case-sensitively) one of the
// Microsoft Office MIME types listed below.
static bool IsOfficeType(const std::string& type_hint) {
  static const char* const kOfficeMimeTypes[] = {
      "application/msword",
      "application/vnd.ms-excel",
      "application/vnd.ms-powerpoint",
      "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
      "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
      "application/vnd.openxmlformats-officedocument.presentationml.presentation",
      "application/vnd.ms-excel.sheet.macroenabled.12",
      "application/vnd.ms-word.document.macroenabled.12",
      "application/vnd.ms-powerpoint.presentation.macroenabled.12",
      "application/mspowerpoint",
      "application/msexcel",
      "application/vnd.ms-word",
      "application/vnd.ms-word.document.12",
      "application/vnd.msword",
  };
  for (const char* const office_mime_type : kOfficeMimeTypes) {
    if (type_hint == office_mime_type)
      return true;
  }
  return false;
}
475 
476 // This function checks for files that have a Microsoft Office MIME type
477 // set, but are not actually Office files.
478 //
479 // If this is not actually an Office file, |*result| is set to
480 // "application/octet-stream", otherwise it is not modified.
481 //
482 // Returns false if additional data is required to determine the file type, or
483 // true if there is enough data to make a decision.
SniffForInvalidOfficeDocs(std::string_view content,const GURL & url,std::string * result)484 static bool SniffForInvalidOfficeDocs(std::string_view content,
485                                       const GURL& url,
486                                       std::string* result) {
487   if (!TruncateStringPiece(kBytesRequiredForOfficeMagic, &content))
488     return false;
489 
490   // Check our table of magic numbers for Office file types.  If it does not
491   // match one, the MIME type was invalid.  Set it instead to a safe value.
492   std::string office_version;
493   if (!CheckForMagicNumbers(content, kOfficeMagicNumbers, &office_version)) {
494     *result = "application/octet-stream";
495   }
496 
497   // We have enough information to determine if this was a Microsoft Office
498   // document or not, so sniffing is completed.
499   return true;
500 }
501 
502 // Tags that indicate the content is likely XML.
503 static const MagicNumber kMagicXML[] = {
504     MAGIC_STRING("application/atom+xml", "<feed"),
505     MAGIC_STRING("application/rss+xml", "<rss"),
506 };
507 
508 // Returns true and sets result if the content appears to contain XHTML or a
509 // feed.
510 // Clears have_enough_content if more data could possibly change the result.
511 //
512 // TODO(evanm): this is similar but more conservative than what Safari does,
513 // while HTML5 has a different recommendation -- what should we do?
514 // TODO(evanm): this is incorrect for documents whose encoding isn't a superset
515 // of ASCII -- do we care?
SniffXML(std::string_view content,bool * have_enough_content,std::string * result)516 static bool SniffXML(std::string_view content,
517                      bool* have_enough_content,
518                      std::string* result) {
519   // We allow at most 300 bytes of content before we expect the opening tag.
520   *have_enough_content &= TruncateStringPiece(300, &content);
521 
522   // This loop iterates through tag-looking offsets in the file.
523   // We want to skip XML processing instructions (of the form "<?xml ...")
524   // and stop at the first "plain" tag, then make a decision on the mime-type
525   // based on the name (or possibly attributes) of that tag.
526   const int kMaxTagIterations = 5;
527   size_t pos = 0;
528   for (size_t i = 0; i < kMaxTagIterations && pos < content.length(); ++i) {
529     pos = content.find('<', pos);
530     if (pos == std::string_view::npos) {
531       return false;
532     }
533 
534     std::string_view current = content.substr(pos);
535 
536     // Skip XML and DOCTYPE declarations.
537     static constexpr std::string_view kXmlPrefix("<?xml");
538     static constexpr std::string_view kDocTypePrefix("<!DOCTYPE");
539     if (base::StartsWith(current, kXmlPrefix,
540                          base::CompareCase::INSENSITIVE_ASCII) ||
541         base::StartsWith(current, kDocTypePrefix,
542                          base::CompareCase::INSENSITIVE_ASCII)) {
543       ++pos;
544       continue;
545     }
546 
547     if (CheckForMagicNumbers(current, kMagicXML, result))
548       return true;
549 
550     // TODO(evanm): handle RSS 1.0, which is an RDF format and more difficult
551     // to identify.
552 
553     // If we get here, we've hit an initial tag that hasn't matched one of the
554     // above tests.  Abort.
555     return true;
556   }
557 
558   // We iterated too far without finding a start tag.
559   // If we have more content to look at, we aren't going to change our mind by
560   // seeing more bytes from the network.
561   return pos < content.length();
562 }
563 
564 // Byte order marks
565 static const MagicNumber kByteOrderMark[] = {
566   MAGIC_NUMBER("text/plain", "\xFE\xFF"),  // UTF-16BE
567   MAGIC_NUMBER("text/plain", "\xFF\xFE"),  // UTF-16LE
568   MAGIC_NUMBER("text/plain", "\xEF\xBB\xBF"),  // UTF-8
569 };
570 
571 // Returns true and sets result to "application/octet-stream" if the content
572 // appears to be binary data. Otherwise, returns false and sets "text/plain".
573 // Clears have_enough_content if more data could possibly change the result.
SniffBinary(std::string_view content,bool * have_enough_content,std::string * result)574 static bool SniffBinary(std::string_view content,
575                         bool* have_enough_content,
576                         std::string* result) {
577   // There is no consensus about exactly how to sniff for binary content.
578   // * IE 7: Don't sniff for binary looking bytes, but trust the file extension.
579   // * Firefox 3.5: Sniff first 4096 bytes for a binary looking byte.
580   // Here, we side with FF, but with a smaller buffer. This size was chosen
581   // because it is small enough to comfortably fit into a single packet (after
582   // allowing for headers) and yet large enough to account for binary formats
583   // that have a significant amount of ASCII at the beginning (crbug.com/15314).
584   const bool is_truncated = TruncateStringPiece(kMaxBytesToSniff, &content);
585 
586   // First, we look for a BOM.
587   std::string unused;
588   if (CheckForMagicNumbers(content, kByteOrderMark, &unused)) {
589     // If there is BOM, we think the buffer is not binary.
590     result->assign("text/plain");
591     return false;
592   }
593 
594   // Next we look to see if any of the bytes "look binary."
595   if (LooksLikeBinary(content)) {
596     result->assign("application/octet-stream");
597     return true;
598   }
599 
600   // No evidence either way. Default to non-binary and, if truncated, clear
601   // have_enough_content because there could be a binary looking byte in the
602   // truncated data.
603   *have_enough_content &= is_truncated;
604   result->assign("text/plain");
605   return false;
606 }
607 
// Returns true if |mime_type| carries no usable information about the content:
// it has no '/' at all (which includes the empty string), or it is exactly one
// of a few well-known placeholder types. Comparisons are case-sensitive.
static bool IsUnknownMimeType(std::string_view mime_type) {
  // TODO(tc): Maybe reuse some code in net/http/http_response_headers.* here.
  // If we do, please be careful not to alter the semantics at all.

  // Firefox rejects a mime type if it does not contain a slash. Empty mime
  // types, which are as unknown as they get, fall into this case too.
  if (mime_type.find('/') == std::string_view::npos)
    return true;

  return mime_type == "unknown/unknown" ||      // Popular and uninformative.
         mime_type == "application/unknown" ||  // Second most popular.
         mime_type == "*/*";  // Firefox rejects a type that is exactly */*.
}
631 
632 // Returns true and sets result if the content appears to be a crx (Chrome
633 // extension) file.
634 // Clears have_enough_content if more data could possibly change the result.
SniffCRX(std::string_view content,const GURL & url,bool * have_enough_content,std::string * result)635 static bool SniffCRX(std::string_view content,
636                      const GURL& url,
637                      bool* have_enough_content,
638                      std::string* result) {
639   // Technically, the crx magic number is just Cr24, but the bytes after that
640   // are a version number which changes infrequently. Including it in the
641   // sniffing gives us less room for error. If the version number ever changes,
642   // we can just add an entry to this list.
643   static const struct MagicNumber kCRXMagicNumbers[] = {
644       MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x02\x00\x00\x00"),
645       MAGIC_NUMBER("application/x-chrome-extension", "Cr24\x03\x00\x00\x00")};
646 
647   // Only consider files that have the extension ".crx".
648   if (!url.path_piece().ends_with(".crx")) {
649     return false;
650   }
651 
652   *have_enough_content &= TruncateStringPiece(kBytesRequiredForMagic, &content);
653   return CheckForMagicNumbers(content, kCRXMagicNumbers, result);
654 }
655 
ShouldSniffMimeType(const GURL & url,std::string_view mime_type)656 bool ShouldSniffMimeType(const GURL& url, std::string_view mime_type) {
657   bool sniffable_scheme = url.is_empty() || url.SchemeIsHTTPOrHTTPS() ||
658 #if BUILDFLAG(IS_ANDROID)
659                           url.SchemeIs("content") ||
660 #endif
661                           url.SchemeIsFile() || url.SchemeIsFileSystem();
662   if (!sniffable_scheme)
663     return false;
664 
665   static const char* const kSniffableTypes[] = {
666     // Many web servers are misconfigured to send text/plain for many
667     // different types of content.
668     "text/plain",
669     // We want to sniff application/octet-stream for
670     // application/x-chrome-extension, but nothing else.
671     "application/octet-stream",
672     // XHTML and Atom/RSS feeds are often served as plain xml instead of
673     // their more specific mime types.
674     "text/xml",
675     "application/xml",
676     // Check for false Microsoft Office MIME types.
677     "application/msword",
678     "application/vnd.ms-excel",
679     "application/vnd.ms-powerpoint",
680     "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
681     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
682     "application/vnd.openxmlformats-officedocument.presentationml.presentation",
683     "application/vnd.ms-excel.sheet.macroenabled.12",
684     "application/vnd.ms-word.document.macroenabled.12",
685     "application/vnd.ms-powerpoint.presentation.macroenabled.12",
686     "application/mspowerpoint",
687     "application/msexcel",
688     "application/vnd.ms-word",
689     "application/vnd.ms-word.document.12",
690     "application/vnd.msword",
691   };
692   for (const char* const sniffable_type : kSniffableTypes) {
693     if (mime_type == sniffable_type)
694       return true;
695   }
696   if (IsUnknownMimeType(mime_type)) {
697     // The web server didn't specify a content type or specified a mime
698     // type that we ignore.
699     return true;
700   }
701   return false;
702 }
703 
SniffMimeType(std::string_view content,const GURL & url,const std::string & type_hint,ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,std::string * result)704 bool SniffMimeType(std::string_view content,
705                    const GURL& url,
706                    const std::string& type_hint,
707                    ForceSniffFileUrlsForHtml force_sniff_file_url_for_html,
708                    std::string* result) {
709   // Sanity check.
710   DCHECK_LT(content.length(), 1000000U);
711   DCHECK(result);
712 
713   // By default, we assume we have enough content.
714   // Each sniff routine may unset this if it wasn't provided enough content.
715   bool have_enough_content = true;
716 
717   // By default, we'll return the type hint.
718   // Each sniff routine may modify this if it has a better guess..
719   result->assign(type_hint);
720 
721   // If the file has a Microsoft Office MIME type, we should only check that it
722   // is a valid Office file.  Because this is the only reason we sniff files
723   // with a Microsoft Office MIME type, we can return early.
724   if (IsOfficeType(type_hint))
725     return SniffForInvalidOfficeDocs(content, url, result);
726 
727   // Cache information about the type_hint
728   bool hint_is_unknown_mime_type = IsUnknownMimeType(type_hint);
729 
730   // First check for HTML, unless it's a file URL and
731   // |allow_sniffing_files_urls_as_html| is false.
732   if (hint_is_unknown_mime_type &&
733       (!url.SchemeIsFile() ||
734        force_sniff_file_url_for_html == ForceSniffFileUrlsForHtml::kEnabled)) {
735     // We're only willing to sniff HTML if the server has not supplied a mime
736     // type, or if the type it did supply indicates that it doesn't know what
737     // the type should be.
738     if (SniffForHTML(content, &have_enough_content, result))
739       return true;  // We succeeded in sniffing HTML.  No more content needed.
740   }
741 
742   // We're only willing to sniff for binary in 3 cases:
743   // 1. The server has not supplied a mime type.
744   // 2. The type it did supply indicates that it doesn't know what the type
745   //    should be.
746   // 3. The type is "text/plain" which is the default on some web servers and
747   //    could be indicative of a mis-configuration that we shield the user from.
748   const bool hint_is_text_plain = (type_hint == "text/plain");
749   if (hint_is_unknown_mime_type || hint_is_text_plain) {
750     if (!SniffBinary(content, &have_enough_content, result)) {
751       // If the server said the content was text/plain and it doesn't appear
752       // to be binary, then we trust it.
753       if (hint_is_text_plain) {
754         return have_enough_content;
755       }
756     }
757   }
758 
759   // If we have plain XML, sniff XML subtypes.
760   if (type_hint == "text/xml" || type_hint == "application/xml") {
761     // We're not interested in sniffing these types for images and the like.
762     // Instead, we're looking explicitly for a feed.  If we don't find one
763     // we're done and return early.
764     if (SniffXML(content, &have_enough_content, result))
765       return true;
766     return have_enough_content;
767   }
768 
769   // CRX files (Chrome extensions) have a special sniffing algorithm. It is
770   // tighter than the others because we don't have to match legacy behavior.
771   if (SniffCRX(content, url, &have_enough_content, result))
772     return true;
773 
774   // Check the file extension and magic numbers to see if this is an Office
775   // document.  This needs to be checked before the general magic numbers
776   // because zip files and Office documents (OOXML) have the same magic number.
777   if (SniffForOfficeDocs(content, url, &have_enough_content, result)) {
778     return true;  // We've matched a magic number.  No more content needed.
779   }
780 
781   // We're not interested in sniffing for magic numbers when the type_hint
782   // is application/octet-stream.  Time to bail out.
783   if (type_hint == "application/octet-stream")
784     return have_enough_content;
785 
786   // Now we look in our large table of magic numbers to see if we can find
787   // anything that matches the content.
788   if (SniffForMagicNumbers(content, &have_enough_content, result))
789     return true;  // We've matched a magic number.  No more content needed.
790 
791   return have_enough_content;
792 }
793 
SniffMimeTypeFromLocalData(std::string_view content,std::string * result)794 bool SniffMimeTypeFromLocalData(std::string_view content, std::string* result) {
795   // First check the extra table.
796   if (CheckForMagicNumbers(content, kExtraMagicNumbers, result))
797     return true;
798   // Finally check the original table.
799   return CheckForMagicNumbers(content, kMagicNumbers, result);
800 }
801 
// Returns true if |content| contains at least one "binary data byte", and
// false otherwise (including for empty |content|).
//
// The definition of "binary bytes" is from the spec at
// https://mimesniff.spec.whatwg.org/#binary-data-byte
bool LooksLikeBinary(std::string_view content) {
  // Every byte that can be "binary" is < 0x20, so the whole classification
  // fits in a 32-bit mask: bit N describes byte value N (the
  // least-significant bit is byte 0x00, the most-significant bit is byte
  // 0x1F).  A set bit means "binary", a clear bit means "text".
  //
  // These are the only control characters that count as text: TAB, LF, FF,
  // CR and ESC.
  constexpr uint32_t kTextControlBits =
      1u << '\t' | 1u << '\n' | 1u << '\f' | 1u << '\r' | 1u << '\x1b';
  constexpr uint32_t kBinaryBits = ~kTextControlBits;

  for (const char c : content) {
    // Go through uint8_t so that bytes >= 0x80 never appear negative and are
    // rejected by the range check before being used as a shift count.
    const uint8_t byte = static_cast<uint8_t>(c);
    if (byte < 0x20 && (kBinaryBits & (1u << byte)))
      return true;
  }
  return false;
}
819 
820 }  // namespace net
821